/* Copyright (C) 2008-2015 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include <spu_mfcio.h>
#include <spu_internals.h>
#include <spu_intrinsics.h>
#include <spu_cache.h>

extern unsigned long long __ea_local_store;
extern char __cache_tag_array_size;

#define LINE_SIZE 128
#define TAG_MASK (LINE_SIZE - 1)

#define WAYS 4
#define SET_MASK ((int) &__cache_tag_array_size - LINE_SIZE)

#define CACHE_LINES ((int) &__cache_tag_array_size /		\
		     sizeof (struct __cache_tag_array) * WAYS)

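/* The cache tag array.  Each entry describes one set of a WAYS-way
   set-associative cache with LINE_SIZE-byte lines.  For each way:
   tag_lo and tag_hi hold the line's effective address (tag_hi is
   unused for 32-bit EAs), base holds the local-store address of the
   cached data, reserved holds an LRU counter plus a local-store flag
   in its MSB, and dirty_bits is a per-byte dirty bitmap for the
   line.  */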
struct __cache_tag_array
{
  unsigned int tag_lo[WAYS];
  unsigned int tag_hi[WAYS];
  void *base[WAYS];
  int reserved[WAYS];
  vector unsigned short dirty_bits[WAYS];
};

extern struct __cache_tag_array __cache_tag_array[];
extern char __cache[];

/* In order to make the code seem a little cleaner, and to avoid having
   64/32 bit ifdefs all over the place, we use macros.  */

#ifdef __EA64__
typedef unsigned long long addr;

#define CHECK_TAG(_entry, _way, _tag)			\
  ((_entry)->tag_lo[(_way)] == ((_tag) & 0xFFFFFFFF)	\
   && (_entry)->tag_hi[(_way)] == ((_tag) >> 32))

#define GET_TAG(_entry, _way) \
  ((unsigned long long)(_entry)->tag_hi[(_way)] << 32	\
   | (unsigned long long)(_entry)->tag_lo[(_way)])

#define SET_TAG(_entry, _way, _tag)			\
  (_entry)->tag_lo[(_way)] = (_tag) & 0xFFFFFFFF;	\
  (_entry)->tag_hi[(_way)] = (_tag) >> 32

#else /*__EA32__*/
typedef unsigned long addr;

#define CHECK_TAG(_entry, _way, _tag)			\
  ((_entry)->tag_lo[(_way)] == (_tag))

#define GET_TAG(_entry, _way)				\
  ((_entry)->tag_lo[(_way)])

#define SET_TAG(_entry, _way, _tag)			\
  (_entry)->tag_lo[(_way)] = (_tag)

#endif

/* In GET_ENTRY, we cast away the high 32 bits,
   as the tag is only in the low 32.  */

#define GET_ENTRY(_addr)						   \
  ((struct __cache_tag_array *)						   \
   si_to_uint (si_a (si_and (si_from_uint ((unsigned int) (addr) (_addr)), \
			     si_from_uint (SET_MASK)),			   \
	       si_from_uint ((unsigned int) __cache_tag_array))))

#define GET_CACHE_LINE(_addr, _way) \
  ((void *) (__cache + ((_addr) & SET_MASK) * WAYS) + ((_way) * LINE_SIZE))

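/* Each way keeps a 128-bit dirty bitmap, one bit per byte of the line;
   CHECK_DIRTY is nonzero if any bit is set.  A tag_lo value of 1 marks
   a way as empty: real tags are 128-byte aligned and can never be 1.  */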
#define CHECK_DIRTY(_vec) (si_to_uint (si_orx ((qword) (_vec))))
#define SET_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] = 1)
#define CHECK_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] == 1)

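/* The reserved word of each way holds an LRU counter in its low bits
   and, in the MSB, a flag for lines that alias the SPU's own local
   store and therefore never need to be written back.  */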
#define LS_FLAG 0x80000000
#define SET_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] |= LS_FLAG)
#define CHECK_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] & LS_FLAG)
#define GET_LRU(_entry, _way) ((_entry)->reserved[(_way)] & ~LS_FLAG)

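/* MFC tag group used for cache-line DMA.  Valid tags are 0-31, so 32
   means no tag has been reserved yet; __cache_fill reserves one on
   first use.  */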
static int dma_tag = 32;

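/* Write the line held by ENTRY/WAY back to its effective address if it
   is dirty and does not merely alias local store, then mark the way
   empty.  Only bytes recorded in the dirty bitmap are written, using an
   atomic getllar/putllc sequence (or a plain DMA put when NONATOMIC is
   defined).  */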
static void
__cache_evict_entry (struct __cache_tag_array *entry, int way)
{
  addr tag = GET_TAG (entry, way);

  if (CHECK_DIRTY (entry->dirty_bits[way]) && !CHECK_IS_LS (entry, way))
    {
#ifdef NONATOMIC
      /* Non-atomic writes.  */
      unsigned int oldmask, mach_stat;
      char *line = ((void *) 0);

      /* Enter critical section.  */
      mach_stat = spu_readch (SPU_RdMachStat);
      spu_idisable ();

      /* Issue DMA request.  */
      line = GET_CACHE_LINE (entry->tag_lo[way], way);
      mfc_put (line, tag, LINE_SIZE, dma_tag, 0, 0);

      /* Wait for DMA completion.  */
      oldmask = mfc_read_tag_mask ();
      mfc_write_tag_mask (1 << dma_tag);
      mfc_read_tag_status_all ();
      mfc_write_tag_mask (oldmask);

      /* Leave critical section.  */
      if (__builtin_expect (mach_stat & 1, 0))
	spu_ienable ();
#else
      /* Allocate a buffer large enough that we know it has 128 bytes
         that are 128-byte aligned (for DMA).  */

      char buffer[LINE_SIZE + 127];
      qword *buf_ptr = (qword *) (((unsigned int) (buffer) + 127) & ~127);
      qword *line = GET_CACHE_LINE (entry->tag_lo[way], way);
      qword bits;
      unsigned int mach_stat;

      /* Enter critical section.  */
      mach_stat = spu_readch (SPU_RdMachStat);
      spu_idisable ();

      do
	{
	  /* We atomically read the current memory into a buffer,
	     modify the dirty bytes in the buffer, and write it
	     back.  If the writeback fails, loop and try again.  */

	  mfc_getllar (buf_ptr, tag, 0, 0);
	  mfc_read_atomic_status ();

	  /* The method we're using to write 16 dirty bytes into
	     the buffer at a time uses fsmb which in turn uses
	     the least significant 16 bits of word 0, so we
	     load the bits and rotate so that the first bit of
	     the bitmap is in the first bit that fsmb will use.  */

	  bits = (qword) entry->dirty_bits[way];
	  bits = si_rotqbyi (bits, -2);

	  /* Si_fsmb creates the mask of dirty bytes.
	     Use selb to nab the appropriate bits.  */
	  buf_ptr[0] = si_selb (buf_ptr[0], line[0], si_fsmb (bits));

	  /* Rotate to next 16 byte section of cache.  */
	  bits = si_rotqbyi (bits, 2);

	  buf_ptr[1] = si_selb (buf_ptr[1], line[1], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[2] = si_selb (buf_ptr[2], line[2], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[3] = si_selb (buf_ptr[3], line[3], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[4] = si_selb (buf_ptr[4], line[4], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[5] = si_selb (buf_ptr[5], line[5], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[6] = si_selb (buf_ptr[6], line[6], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[7] = si_selb (buf_ptr[7], line[7], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);

	  mfc_putllc (buf_ptr, tag, 0, 0);
	}
      while (mfc_read_atomic_status ());

      /* Leave critical section.  */
      if (__builtin_expect (mach_stat & 1, 0))
	spu_ienable ();
#endif
    }

  /* In any case, mark the lo tag with 1, which denotes empty.  */
  SET_EMPTY (entry, way);
  entry->dirty_bits[way] = (vector unsigned short) si_from_uint (0);
}

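/* Evict the cache line containing the effective address EA, if it is
   currently cached.  */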
void
__cache_evict (__ea void *ea)
{
  addr tag = (addr) ea & ~TAG_MASK;
  struct __cache_tag_array *entry = GET_ENTRY (ea);
  int i = 0;

  /* Cycle through all the ways this address could occupy and evict
     any way whose tag matches.  */

  for (i = 0; i < WAYS; i++)
    if (CHECK_TAG (entry, i, tag))
      __cache_evict_entry (entry, i);
}

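/* DMA the 128-byte line at effective address TAG into the local-store
   storage for WAY, wait for completion, and return the local-store
   address of the line.  */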
static void *
__cache_fill (int way, addr tag)
{
  unsigned int oldmask, mach_stat;
  char *line = ((void *) 0);

  /* Reserve our DMA tag.  */
  if (dma_tag == 32)
    dma_tag = mfc_tag_reserve ();

  /* Enter critical section.  */
  mach_stat = spu_readch (SPU_RdMachStat);
  spu_idisable ();

  /* Issue DMA request.  */
  line = GET_CACHE_LINE (tag, way);
  mfc_get (line, tag, LINE_SIZE, dma_tag, 0, 0);

  /* Wait for DMA completion.  */
  oldmask = mfc_read_tag_mask ();
  mfc_write_tag_mask (1 << dma_tag);
  mfc_read_tag_status_all ();
  mfc_write_tag_mask (oldmask);

  /* Leave critical section.  */
  if (__builtin_expect (mach_stat & 1, 0))
    spu_ienable ();

  return (void *) line;
}

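/* Handle a miss for EA in the set described by ENTRY.  WAY is the
   first empty way, or >= WAYS when the set is full, in which case the
   least recently used way is evicted first.  The chosen way's tag is
   set and its line is filled, unless EA aliases the SPU's own local
   store, in which case the entry points directly at the local store.  */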
static void
__cache_miss (__ea void *ea, struct __cache_tag_array *entry, int way)
{

  addr tag = (addr) ea & ~TAG_MASK;
  unsigned int lru = 0;
  int i = 0;
  int idx = 0;

  /* If way >= WAYS, then there are no empty slots, so we must evict
     the least recently used entry.  */
  if (way >= WAYS)
    {
      for (i = 0; i < WAYS; i++)
	{
	  if (GET_LRU (entry, i) > lru)
	    {
	      lru = GET_LRU (entry, i);
	      idx = i;
	    }
	}
      __cache_evict_entry (entry, idx);
      way = idx;
    }

  /* Set the empty entry's tag and fill its cache line.  */

  SET_TAG (entry, way, tag);
  entry->reserved[way] = 0;

  /* Check if the address is just an effective address within the
     SPU's local store.  */

  /* Because the LS is not 256k aligned, we can't simply mask and
     compare here, so we must check the whole range.  */

  if ((addr) ea >= (addr) __ea_local_store
      && (addr) ea < (addr) (__ea_local_store + 0x40000))
    {
      SET_IS_LS (entry, way);
      entry->base[way] =
	(void *) ((unsigned int) ((addr) ea -
				  (addr) __ea_local_store) & ~0x7f);
    }
  else
    {
      entry->base[way] = __cache_fill (way, tag);
    }
}

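/* Return a local-store pointer through which EA can be accessed,
   filling the cache line on a miss.  The first N_BYTES_DIRTY bytes
   starting at EA are marked dirty so that they are written back when
   the line is eventually evicted.  */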
void *
__cache_fetch_dirty (__ea void *ea, int n_bytes_dirty)
{
#ifdef __EA64__
  unsigned int tag_hi;
  qword etag_hi;
#endif
  unsigned int tag_lo;
  struct __cache_tag_array *entry;

  qword etag_lo;
  qword equal;
  qword bit_mask;
  qword way;

  /* In this first chunk, we merely set up the entry pointer and the
     tag.  */

  entry = GET_ENTRY (ea);

#ifndef __EA64__
  tag_lo =
    si_to_uint (si_andc
		(si_shufb
		 (si_from_uint ((addr) ea), si_from_uint (0),
		  si_from_uint (0x00010203)), si_from_uint (TAG_MASK)));
#else
  tag_lo =
    si_to_uint (si_andc
		(si_shufb
		 (si_from_ullong ((addr) ea), si_from_uint (0),
		  si_from_uint (0x04050607)), si_from_uint (TAG_MASK)));

  tag_hi =
    si_to_uint (si_shufb
		(si_from_ullong ((addr) ea), si_from_uint (0),
		 si_from_uint (0x00010203)));
#endif

  /* Increment LRU in reserved bytes.  */
  si_stqd (si_ai (si_lqd (si_from_ptr (entry), 48), 1),
	   si_from_ptr (entry), 48);

missreturn:
  /* Check if the entry's lo_tag is equal to the address' lo_tag.  */
  etag_lo = si_lqd (si_from_ptr (entry), 0);
  equal = si_ceq (etag_lo, si_from_uint (tag_lo));
#ifdef __EA64__
  /* And the high tag too.  */
  etag_hi = si_lqd (si_from_ptr (entry), 16);
  equal = si_and (equal, (si_ceq (etag_hi, si_from_uint (tag_hi))));
#endif

  if ((si_to_uint (si_orx (equal)) == 0))
    goto misshandler;

  if (n_bytes_dirty)
    {
      /* way = 0x40,0x50,0x60,0x70 for each way, which is also the
         offset of the appropriate dirty bits.  */
      way = si_shli (si_clz (si_gbb (equal)), 2);

      /* To create the bit_mask, we set it to all 1s (uint -1), then we
         shift it over (128 - n_bytes_dirty) times.  */

      bit_mask = si_from_uint (-1);

      bit_mask =
	si_shlqby (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) / 8));

      bit_mask =
	si_shlqbi (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) % 8));

      /* Rotate it around to the correct offset.  */
      bit_mask =
	si_rotqby (bit_mask,
		   si_from_uint (-1 * ((addr) ea & TAG_MASK) / 8));

      bit_mask =
	si_rotqbi (bit_mask,
		   si_from_uint (-1 * ((addr) ea & TAG_MASK) % 8));

      /* Update the dirty bits.  */
      si_stqx (si_or (si_lqx (si_from_ptr (entry), way), bit_mask),
	       si_from_ptr (entry), way);
    }

  /* We've definitely found the right entry; set LRU (reserved) to 0
     while maintaining the LS flag (MSB).  */

  si_stqd (si_andc
	   (si_lqd (si_from_ptr (entry), 48),
	    si_and (equal, si_from_uint (~(LS_FLAG)))),
	   si_from_ptr (entry), 48);

  return (void *)
    si_to_uint (si_a
		(si_orx
		 (si_and (si_lqd (si_from_ptr (entry), 32), equal)),
		 si_from_uint (((unsigned int) (addr) ea) & TAG_MASK)));

misshandler:
  equal = si_ceqi (etag_lo, 1);
  __cache_miss (ea, entry, (si_to_uint (si_clz (si_gbb (equal))) - 16) >> 2);
  goto missreturn;
}

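/* Fetch for reading only; no bytes are marked dirty.  */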
void *
__cache_fetch (__ea void *ea)
{
  return __cache_fetch_dirty (ea, 0);
}

void
__cache_touch (__ea void *ea __attribute__ ((unused)))
{
  /* NO-OP for now.  */
}

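/* Evict every valid line in the cache.  Registered as a destructor so
   that all dirty data is written back when the program exits.  */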
void __cache_flush (void) __attribute__ ((destructor));
void
__cache_flush (void)
{
  struct __cache_tag_array *entry = __cache_tag_array;
  unsigned int i;
  int j;

  /* Cycle through each cache entry and evict all used ways.  */

  for (i = 0; i < CACHE_LINES / WAYS; i++)
    {
      for (j = 0; j < WAYS; j++)
	if (!CHECK_EMPTY (entry, j))
	  __cache_evict_entry (entry, j);

      entry++;
    }
}