path: root/arch/arm/mach-ux500/dcache.c
/*
 * Copyright (C) ST-Ericsson SA 2011
 *
 * Cache handler integration and data cache helpers.
 *
 * Author: Johan Mossberg <johan.xx.mossberg@stericsson.com>
 * for ST-Ericsson.
 *
 * License terms: GNU General Public License (GPL), version 2.
 */

#include <linux/dma-mapping.h>

#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/outercache.h>
#include <asm/system.h>

/*
 * Values are derived from measurements on HREFP_1.1_V32_OM_S10 running
 * u8500-android-2.2_r1.1_v0.21.
 *
 * A lot of time could be spent trying to figure out the perfect breakpoints,
 * but for now I've chosen the following simple approach:
 *
 * breakpoint = best_case + (worst_case - best_case) * 0.666
 * The breakpoint is moved slightly towards the worst case because a full
 * clean/flush affects the entire system, so we should be a bit careful.
 *
 * BEST CASE:
 * The best case is that the cache is empty and the system is idling. The case
 * where the cache contains only the targeted data could be better in some
 * situations, but it is hard to measure and reason about that case, so I chose
 * the easier alternative.
 *
 * inner_clean_breakpoint = time_2_range_clean_on_empty_cache(
 *					complete_clean_on_empty_cache_time)
 * inner_flush_breakpoint = time_2_range_flush_on_empty_cache(
 *					complete_flush_on_empty_cache_time)
 *
 * outer_clean_breakpoint = time_2_range_clean_on_empty_cache(
 *					complete_clean_on_empty_cache_time)
 * outer_flush_breakpoint = time_2_range_flush_on_empty_cache(
 *					complete_flush_on_empty_cache_time)
 *
 * WORST CASE:
 * The worst case is that the cache is filled with dirty, non-targeted data
 * that will be used after the synchronization, and the system is under heavy
 * load.
 *
 * inner_clean_breakpoint = time_2_range_clean_on_empty_cache(
 *				complete_clean_on_full_cache_time * 1.5)
 * Times 1.5 because it runs on both cores half the time.
 * inner_flush_breakpoint = time_2_range_flush_on_empty_cache(
 *				complete_flush_on_full_cache_time * 1.5 +
 *					complete_flush_on_full_cache_time / 2)
 * Plus "complete_flush_on_full_cache_time / 2" because all data has to be read
 * back, here we assume that both cores can fill their cache simultaneously
 * (seems to be the case as operations on full and empty inner cache takes
 * roughly the same amount of time ie the bus to outer is not the bottle neck).
 *
 * outer_clean_breakpoint = time_2_range_clean_on_empty_cache(
 *					complete_clean_on_full_cache_time +
 *					(complete_clean_on_full_cache_time -
 *					complete_clean_on_empty_cache_time))
 * Plus "(complete_flush_on_full_cache_time -
 * complete_flush_on_empty_cache_time)" because no one else can work when we
 * hog the bus with our unecessary transfer.
 * outer_flush_breakpoint = time_2_range_flush_on_empty_cache(
 *					complete_flush_on_full_cache_time * 2 +
 *					(complete_flush_on_full_cache_time -
 *				complete_flush_on_empty_cache_time) * 2)
 *
 * These values might have to be updated if changes are made to the CPU, L2$,
 * memory bus or memory.
 */
/* 28930 */
static const u32 inner_clean_breakpoint = 21324 + (32744 - 21324) * 0.666;
/* 36224 */
static const u32 inner_flush_breakpoint = 21324 + (43697 - 21324) * 0.666;
/* 254069 */
static const u32 outer_clean_breakpoint = 68041 + (347363 - 68041) * 0.666;
/* 485414 */
static const u32 outer_flush_breakpoint = 68041 + (694727 - 68041) * 0.666;
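
/*
 * Worked example for the first constant: each breakpoint is a range length in
 * bytes, compared against the "length" argument of the functions below, and
 * 21324 + (32744 - 21324) * 0.666 ~= 28930, as the comment above it notes.
 * Ranges shorter than the breakpoint are maintained with range operations,
 * longer ranges fall back to a complete clean/flush.
 */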

static void __clean_inner_dcache_all(void *param);
static void clean_inner_dcache_all(void);

static void __flush_inner_dcache_all(void *param);
static void flush_inner_dcache_all(void);

static bool is_cache_exclusive(void);

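/*
 * Drain the write buffers: dsb() completes outstanding CPU writes and
 * outer_cache.sync() drains the outer (L2) cache's write buffer.
 */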
void drain_cpu_write_buf(void)
{
	dsb();
	outer_cache.sync();
}

void clean_cpu_dcache(void *vaddr, u32 paddr, u32 length, bool inner_only,
						bool *cleaned_everything)
{
	/*
	 * There is no problem with exclusive caches here as the Cortex-A9
	 * documentation (8.1.4. Exclusive L2 cache) says that when a dirty
	 * line is moved from L2 to L1 it is first written to memory. Because
	 * of this there is no way a line can avoid the clean by jumping
	 * between the cache levels.
	 */
	*cleaned_everything = true;

	if (length < inner_clean_breakpoint) {
		/* Inner clean range */
		dmac_map_area(vaddr, length, DMA_TO_DEVICE);
		*cleaned_everything = false;
	} else {
		clean_inner_dcache_all();
	}

	if (!inner_only) {
		/*
		 * There is currently no outer_cache.clean_all() so we use
		 * flush instead, which is ok as clean is a subset of flush.
		 * Clean range and flush range take the same amount of time
		 * so we can use outer_flush_breakpoint here.
		 */
		if (length < outer_flush_breakpoint) {
			outer_cache.clean_range(paddr, paddr + length);
			*cleaned_everything = false;
		} else {
			outer_cache.flush_all();
		}
	}
}
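
/*
 * Example: cleaning a buffer that the CPU has written before a device reads
 * it over DMA (buf_virt, buf_phys and buf_len are illustrative names):
 *
 *	bool cleaned_all;
 *
 *	clean_cpu_dcache(buf_virt, buf_phys, buf_len, false, &cleaned_all);
 *	drain_cpu_write_buf();
 *
 * After drain_cpu_write_buf() returns, the buffer contents have reached
 * memory whether the range or the complete cache was cleaned.
 */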

void flush_cpu_dcache(void *vaddr, u32 paddr, u32 length, bool inner_only,
						bool *flushed_everything)
{
	/*
	 * There might still be stale data in the caches after this call if the
	 * cache levels are exclusive. The following can happen:
	 * 1. Clean L1 moves the data to L2.
	 * 2. Speculative prefetch, preemption or loads on the other core move
	 * all the data back to L1; any dirty data will be written to memory as
	 * a result of this.
	 * 3. Flush L2 does nothing as there is no targeted data in L2.
	 * 4. Flush L1 moves the data to L2. Notice that this does not happen
	 * when the cache levels are non-exclusive as clean pages are not
	 * written to L2 in that case.
	 * 5. Stale data is still present in L2!
	 * I see two possible solutions: don't use exclusive caches, or
	 * (temporarily) disable prefetching to L1, preemption and the other
	 * core.
	 *
	 * A situation can occur where the operation does not seem atomic from
	 * the other core's point of view, even on a non-exclusive cache setup.
	 * Replace step 2 in the previous scenario with a write from the other
	 * core. The other core will write on top of the old data but the
	 * result will not be written to memory. One would expect either that
	 * the write was performed on top of the old data and was written to
	 * memory (the write occurred before the flush) or that the write was
	 * performed on top of the new data and was not written to memory (the
	 * write occurred after the flush). The same problem can occur with one
	 * core if kernel preemption is enabled. The solution is to
	 * (temporarily) disable the other core and preemption. I can't think
	 * of any situation where this would be a problem, and disabling the
	 * other core for the duration of this call is mighty expensive, so for
	 * now I just ignore the problem.
	 */

	*flushed_everything = true;

	if (!inner_only) {
		/*
		 * There is no good way to handle exclusive caches here (see
		 * the comment above), so refuse to continue if exclusive
		 * caching is enabled.
		 */
		if (is_cache_exclusive())
			panic("%s can't handle exclusive CPU caches\n",
								__func__);

		if (length < inner_clean_breakpoint) {
			/* Inner clean range */
			dmac_map_area(vaddr, length, DMA_TO_DEVICE);
			*flushed_everything = false;
		} else {
			clean_inner_dcache_all();
		}

		if (length < outer_flush_breakpoint) {
			outer_cache.flush_range(paddr, paddr + length);
			*flushed_everything = false;
		} else {
			outer_cache.flush_all();
		}
	}

	if (length < inner_flush_breakpoint) {
		/* Inner flush range */
		dmac_flush_range(vaddr, (void *)((u32)vaddr + length));
		*flushed_everything = false;
	} else {
		flush_inner_dcache_all();
	}
}
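
/*
 * Example: flushing a buffer that a device has written before the CPU reads
 * it (buf_virt, buf_phys and buf_len are illustrative names):
 *
 *	bool flushed_all;
 *
 *	flush_cpu_dcache(buf_virt, buf_phys, buf_len, false, &flushed_all);
 *
 * Any cached lines covering the range are now gone from L1 and L2, so a
 * following CPU read sees the device's data, provided nothing speculatively
 * re-fetches the range first (see speculative_data_prefetch() below).
 */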

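/*
 * The Cortex-A9 prefetches data speculatively, so lines may reappear in the
 * caches at any time after a flush.
 */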
bool speculative_data_prefetch(void)
{
	return true;
}

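/*
 * Both the Cortex-A9 L1 data cache and the PL310 L2 cache use 32-byte lines,
 * which is the finest granularity cache maintenance operates on.
 */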
u32 get_dcache_granularity(void)
{
	return 32;
}

/*
 * Local functions
 */

static void __clean_inner_dcache_all(void *param)
{
	__cpuc_clean_dcache_all();
}

static void clean_inner_dcache_all(void)
{
	on_each_cpu(__clean_inner_dcache_all, NULL, 1);
}

static void __flush_inner_dcache_all(void *param)
{
	__cpuc_flush_dcache_all();
}

static void flush_inner_dcache_all(void)
{
	on_each_cpu(__flush_inner_dcache_all, NULL, 1);
}

static bool is_cache_exclusive(void)
{
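	/*
	 * ACTLR[7] is the Cortex-A9 "Exclusive cache" bit; the Auxiliary
	 * Control Register is read with MRC p15, 0, <Rt>, c1, c0, 1 below.
	 */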
	static const u32 CA9_ACTLR_EXCL = 0x80;

	u32 armv7_actlr;

	asm (
		"mrc	p15, 0, %0, c1, c0, 1"
		: "=r" (armv7_actlr)
	);

	if (armv7_actlr & CA9_ACTLR_EXCL)
		return true;
	else
		return false;
}