
/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Cachegrind, a Valgrind skin for cache
   profiling programs.

   Copyright (C) 2002 Nicholas Nethercote
      njn25@cam.ac.uk

   Modified for Callgrind-Skin, Josef Weidendorfer

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/

/* Cache configuration: the geometry of one cache as handed to
 * cachesim_initcache(), which copies these values into the run-time
 * cache state and derives the remaining fields from them. */
typedef struct {
    int size;       /* total cache size in bytes */ 
    int assoc;      /* associativity; sets = (size / line_size) / assoc */
    int line_size;  /* cache line size in bytes */ 
    Bool sectored;  /* prefetch nearside cacheline on read */
} cache_t;

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of instruction 
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */
/* Per-cache-line usage record; one entry per line is allocated by
 * cachesim_initcache() when cache use collection is enabled. */
typedef struct {
  UInt count; /* incremented on each access that hits the line */
  UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
} line_use;

/* Per-cache-line record of what loaded the line; one entry per line is
 * allocated by cachesim_initcache() when cache use collection is enabled.
 * All fields are reset to 0 by cachesim_clearcache(). */
typedef struct {
  /* NOTE(review): presumably the memory line address held in this cache
   * line and the address of the instruction that loaded it - confirm
   * against the code that fills these in. */
  Addr memline, iaddr;
  line_use* dep_use; /* point to higher-level cacheblock for this memline */
  /* NOTE(review): presumably the cost array that receives the use
   * counters when the line is evicted - confirm. */
  ULong* use_base;
} line_loaded;  

/* Cache state: the configuration copied from a cache_t plus everything
 * cachesim_initcache() derives from it, and the simulated tag store. */
typedef struct {
   char*        name;                   /* used in debug output only here */
   int          size;                   /* bytes */
   int          assoc;                  /* ways per set */
   int          line_size;              /* bytes */
   Bool         sectored;  /* prefetch nearside cacheline on read */
   int          sets;                   /* (size / line_size) / assoc */
   int          sets_min_1;             /* sets - 1; mask for the set index */
   int          assoc_bits;             /* log2(assoc) */
   int          line_size_bits;         /* log2(line_size) */
   int          tag_shift;              /* line_size_bits + log2(sets) */
   int          tag_mask;               /* ~((1 << tag_shift) - 1) */
   char         desc_line[128];         /* human-readable description */
   int*         tags;                   /* sets * assoc entries; the ways of
                                           set s start at s << assoc_bits */

  /* for cache use */
   int          line_size_mask;         /* line_size - 1; byte offset mask */
   int*         line_start_mask;        /* indexed by first byte offset */
   int*         line_end_mask;          /* indexed by last byte offset */
   line_loaded* loaded;                 /* one entry per cache line */
   line_use*    use;                    /* one entry per cache line; 0 when
                                           cache use is not collected */
} cache_t2;

/* Cache states. The simulation functions generated for I1 and D1
 * forward their misses to the functions generated for L2. */
static cache_t2 I1, D1, L2;

/* Lower bits of cache tags are used as flags for a cache line.
 * cachesim_initcache() asserts that the tag mask and the flag mask
 * do not overlap. */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1

/* Cache access types. Write has the value of the dirty flag so that
 * OR-ing the access type into a tag entry marks the line dirty. */
enum AccType { Read = 0, Write = CACHELINE_DIRTY };

/* Cache simulator Options (all off by default).
 * In the code visible here, clo_simulate_hwpref gates the call to the
 * prefetcher in prefetch_L2_doRead()/prefetch_L2_doWrite(), and
 * clo_collect_cacheuse makes cachesim_initcache() allocate the per-line
 * use-tracking arrays.
 * NOTE(review): the remaining options are presumably evaluated when the
 * simulator variant is selected - confirm against their users. */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_cache = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global variables are set up beforehand by
 *  setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr   bb_base     (instruction start address of original BB)
 * - ULong* cost_base   (start of cost array for BB)
 * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 */

/* Offset to events in event set, used in log_* functions.
 * NOTE(review): the names suggest one offset per combination of
 * simulation outcome (D0/D1r/D1w/D2) and event type (Ir/Dr/Dw);
 * confirm against the code that assigns them. */
static Int off_D0_Ir;
static Int off_D1r_Ir;
static Int off_D1r_Dr;
static Int off_D1w_Ir;
static Int off_D1w_Dw;
static Int off_D2_Ir;
static Int off_D2_Dr;
static Int off_D2_Dw;

/* Per-basic-block context: instruction start address of the original BB
 * and start of the cost array for the BB. */
static Addr   bb_base;
static ULong* cost_base;
/* NOTE(review): presumably the instruction currently being simulated;
 * not described in this part of the file - confirm. */
static InstrInfo* current_ii;

/* Cache use offsets */
/* FIXME: The offsets are only correct because all eventsets get
 * the "Use" set added first !
 */
/* I1 and D1 share offsets 0/1; L2 uses offsets 2/3. */
static Int off_I1_TUseCount  = 0;
static Int off_I1_SLossCount  = 1;
static Int off_D1_TUseCount  = 0;
static Int off_D1_SLossCount  = 1;
static Int off_L2_TUseCount  = 2;
static Int off_L2_SLossCount  = 3;



/* Reset cache 'c' to its empty state.
 *
 * Every tag entry is zeroed. If use tracking is active for this cache
 * (c->use != 0), the per-line load/use records are cleared as well and
 * each tag entry is seeded with its way number within its set, since the
 * low tag bits then serve as a pointer to the use info.
 */
static void cachesim_clearcache(cache_t2* c)
{
  const Int n_lines = c->sets * c->assoc;
  Int k;

  for (k = 0; k < n_lines; k++)
    c->tags[k] = 0;

  /* Nothing more to do without cache use collection. */
  if (!c->use)
    return;

  for (k = 0; k < n_lines; k++) {
    line_loaded* ld = &c->loaded[k];
    line_use*    lu = &c->use[k];

    ld->memline  = 0;
    ld->use_base = 0;
    ld->dep_use  = 0;
    ld->iaddr    = 0;

    lu->mask  = 0;
    lu->count = 0;

    c->tags[k] = k % c->assoc; /* init lower bits as pointer */
  }
}

/* Initialise the cache state 'c' from the configuration 'config'.
 *
 * By this point, the size/assoc/line_size has been checked.
 *
 * Copies the geometry, derives the set count and the shift/mask values
 * used by the simulation functions, builds the description string and
 * allocates the tag array. When cache use collection is enabled
 * (clo_collect_cacheuse), the per-line use/loaded arrays and the
 * byte-offset masks are allocated and filled in as well; otherwise
 * c->use is set to 0. Finally the cache is cleared.
 *
 * Panics if cache use collection is requested with a configuration
 * whose tag entries have too few free low bits to hold a way number.
 */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = config.sectored;

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->assoc_bits     = VG_(log2)(c->assoc);
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * Should be always true as MIN_LINE_SIZE >= 16 */
   CT_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = VG_(malloc)(sizeof(UInt) * c->sets * c->assoc);
   if (clo_collect_cacheuse) {
     int i;
     unsigned int start_mask, start_val;
     unsigned int end_mask, end_val;

     c->use    = VG_(malloc)(sizeof(line_use) * c->sets * c->assoc);
     c->loaded = VG_(malloc)(sizeof(line_loaded) * c->sets * c->assoc);
     c->line_start_mask = VG_(malloc)(sizeof(int) * c->line_size);
     c->line_end_mask = VG_(malloc)(sizeof(int) * c->line_size);


     c->line_size_mask = c->line_size-1;

     /* Meaning of line_start_mask/line_end_mask
      * Example: for a given cache line, you get an access starting at
      * byte offset 5, length 4, i.e. bytes 5 - 8 were touched. For a
      * cache line size of 32, you have 1 bit per byte in the mask:
      *
      *   bit31   bit8 bit5  bit 0
      *       |      |  |    |
      *       11..111111100000   line_start_mask[5]
      *       00..000111111111   line_end_mask[(5+4)-1]
      *
      *  use_mask |= line_start_mask[5] & line_end_mask[8]
      *
      */
     start_val = end_val = ~0;
     if (c->line_size < 32) {
       /* Several mask bits per byte of the line. */
       int bits_per_byte = 32/c->line_size;
       start_mask = (1u<<bits_per_byte)-1;
       end_mask   = start_mask << (32-bits_per_byte);
       for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         start_val  = start_val & ~start_mask;
         start_mask = start_mask << bits_per_byte;

         c->line_end_mask[c->line_size-i-1] = end_val;
         end_val  = end_val & ~end_mask;
         end_mask = end_mask >> bits_per_byte;
       }
     }
     else {
       /* One mask bit covers one or more bytes of the line. */
       int bytes_per_bit = c->line_size/32;
       start_mask = 1;
       /* Unsigned literal: shifting a signed 1 into bit 31 is undefined. */
       end_mask   = 1u << 31;
       for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         c->line_end_mask[c->line_size-i-1] = end_val;
         if ( ((i+1)%bytes_per_bit) == 0) {
           start_val   &= ~start_mask;
           end_val     &= ~end_mask;
           start_mask <<= 1;
           end_mask   >>= 1;
         }
       }
     }

     CT_DEBUG(6, "Config %s:\n", c->desc_line);
     for(i=0;i<c->line_size;i++) {
       CT_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                i, c->line_start_mask[i], c->line_end_mask[i]);
     }

     /* We use lower tag bits as offset pointers to cache use info.
      * I.e. some cache parameters don't work.
      */
     if (c->tag_shift < c->assoc_bits) {
         VG_(message)(Vg_DebugMsg,
                      "error: Use associativity < %d for cache use statistics!",
                      (1<<c->tag_shift) );
         VG_(skin_panic)("Unsupported cache configuration");
     }
   }
   else
     c->use = 0;
   cachesim_clearcache(c);
}


#if 0
/* Debugging aid (compiled out): dump all tag entries of cache 'c' in hex,
 * one output row per set and one column per way. */
static void print_cache(cache_t2* c)
{
   UInt row;
   UInt idx = 0;   /* running index into c->tags across all sets */

   for (row = 0; row < c->sets; row++) {
      UInt col = 0;
      while (col < c->assoc) {
         VG_(printf)("%8x ", c->tags[idx]);
         col++;
         idx++;
      }
      VG_(printf)("\n");
   }
}
#endif 

/* This is done as a macro rather than by passing in the cache_t2 as an
 * argument, because passing it in slows things down by a small amount
 * (3-5%) due to all the extra indirection. */

/* Simple version without additional usage of flags per cache line.
 *
 * Expands to the definition of
 *     static Int FNAME_L_doREF(Addr a, UChar size)
 * which simulates one access of 'size' bytes at address 'a' on the cache
 * state variable L. The ways of each set are kept in MRU-first order.
 *
 *  - FNAME, REF      only used to build the function name
 *  - L               cache_t2 variable to operate on
 *  - MISS_TREATMENT  statement(s) executed on a miss; must return
 *  - HIT_TREATMENT   statement(s) executed on a hit; must return
 *
 * An access straddling two lines is looked up in both sets but treated
 * as one access: it is a miss if at least one of the lines misses.
 * The second line is looked up with its own tag (tag2), which differs
 * from the first line's tag when the access wraps from the last set
 * around to set 0.
 */
#define CACHESIM1(FNAME, L, REF, MISS_TREATMENT, HIT_TREATMENT)             \
                                                                            \
static Int FNAME##_##L##_do##REF(Addr a, UChar size)                        \
{                                                                           \
   register UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);  \
   register UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);  \
   register UInt tag  = a >> L.tag_shift;                                   \
   register UInt tag2;                                                      \
   int i, j;                                                                \
   Bool is_miss = False;                                                    \
   int* set;                                                                \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      /* Shifting is a bit faster than multiplying */                       \
      set = &(L.tags[set1 << L.assoc_bits]);                                \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == set[0]) {                                                  \
         HIT_TREATMENT;                                                     \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == set[i]) {                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tag;                                                   \
            HIT_TREATMENT;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag;                                                         \
      MISS_TREATMENT;                                                       \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
      /* First line: same lookup as above, but only record a miss. */       \
      set = &(L.tags[set1 << L.assoc_bits]);                                \
      if (tag == set[0]) {                                                  \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == set[i]) {                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tag;                                                   \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag;                                                         \
      is_miss = True;                                                       \
block2:                                                                     \
      /* Second line: use the tag of the last byte accessed. */             \
      set  = &(L.tags[set2 << L.assoc_bits]);                               \
      tag2 = (a+size-1) >> L.tag_shift;                                     \
      if (tag2 == set[0]) {                                                 \
         goto miss_treatment;                                               \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag2 == set[i]) {                                              \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tag2;                                                  \
            goto miss_treatment;                                            \
         }                                                                  \
      }                                                                     \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2;                                                        \
      is_miss = True;                                                       \
miss_treatment:                                                             \
      if (is_miss) { MISS_TREATMENT; }                                      \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2); \
       VG_(skin_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   /* Reached when both lines of a straddling access hit. */                \
   HIT_TREATMENT;                                                           \
}

/* Version with dirty-bit handling.
 *
 * Expands to the definition of
 *     static Int FNAME_L_doREF(Addr a, UChar size)
 * which simulates one access of 'size' bytes at address 'a' on the cache
 * state variable L. The ways of each set are kept in MRU-first order.
 * Tag entries hold (a & L.tag_mask) in the upper bits and per-line flags
 * (CACHELINE_FLAGMASK) in the lower bits.
 *
 *  - FNAME                 only used to build the function name
 *  - L                     cache_t2 variable to operate on
 *  - REF                   Read or Write; used in the function name and
 *                          OR-ed into the tag entry, so that a Write
 *                          sets CACHELINE_DIRTY
 *  - MISS_TREATMENT        executed on a miss; must return
 *  - HIT_TREATMENT         executed on a hit; must return
 *  - MISS_DIRTY_TREATMENT  executed on a miss that evicts a line whose
 *                          dirty flag is set
 *
 * On a hit the flags already stored for the line are kept and REF is
 * OR-ed in: a write hit marks the line dirty, a read hit leaves a dirty
 * line dirty.
 *
 * An access straddling two lines is looked up in both sets but treated
 * as one access: it is a miss if at least one of the lines misses, and a
 * dirty miss if at least one eviction hit a dirty line. The second line
 * is looked up with its own tag (tag2), which differs from the first
 * line's tag when the access wraps from the last set around to set 0.
 */

#define CACHESIM2(FNAME, L, REF, MISS_TREATMENT, HIT_TREATMENT, MISS_DIRTY_TREATMENT) \
                                                                            \
static Int FNAME##_##L##_do##REF (Addr a, UChar size)                       \
{                                                                           \
   register UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);  \
   register UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);  \
   register UInt tag  = a & L.tag_mask;                                     \
   register UInt tag2;                                                      \
   int i, j, tmp_tag;                                                       \
   Bool is_miss = False, is_dirty = False;                                  \
   int* set;                                                                \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      /* Shifting is a bit faster than multiplying */                       \
      set = &(L.tags[set1 << L.assoc_bits]);                                \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & ~CACHELINE_FLAGMASK)) {                          \
         set[0] |= REF;            /* a write hit marks the line dirty */   \
         HIT_TREATMENT;                                                     \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & ~CACHELINE_FLAGMASK)) {                       \
            tmp_tag = set[i] | REF;  /* keep stored flags, add REF */       \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            HIT_TREATMENT;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      /* The LRU entry is evicted; remember whether it was dirty. */        \
      is_dirty = set[L.assoc - 1] & CACHELINE_DIRTY;                        \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | REF;                                                   \
      if (is_dirty) MISS_DIRTY_TREATMENT;                                   \
      MISS_TREATMENT;                                                       \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
      /* First line: same lookup as above, but only record the result. */   \
      set = &(L.tags[set1 << L.assoc_bits]);                                \
      if (tag == (set[0] & ~CACHELINE_FLAGMASK)) {                          \
         set[0] |= REF;                                                     \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & ~CACHELINE_FLAGMASK)) {                       \
            tmp_tag = set[i] | REF;                                         \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      is_dirty = set[L.assoc - 1] & CACHELINE_DIRTY;                        \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | REF;                                                   \
      is_miss = True;                                                       \
block2:                                                                     \
      /* Second line: use the tag of the last byte accessed. */             \
      set  = &(L.tags[set2 << L.assoc_bits]);                               \
      tag2 = (a+size-1) & L.tag_mask;                                       \
      if (tag2 == (set[0] & ~CACHELINE_FLAGMASK)) {                         \
         set[0] |= REF;                                                     \
         goto miss_treatment;                                               \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag2 == (set[i] & ~CACHELINE_FLAGMASK)) {                      \
            tmp_tag = set[i] | REF;                                         \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            goto miss_treatment;                                            \
         }                                                                  \
      }                                                                     \
      if (!is_dirty) is_dirty = set[L.assoc - 1] & CACHELINE_DIRTY;         \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | REF;                                                  \
      is_miss = True;                                                       \
miss_treatment:                                                             \
      if (is_miss) {                                                        \
        if (is_dirty) MISS_DIRTY_TREATMENT;                                 \
        MISS_TREATMENT;                                                     \
      }                                                                     \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2); \
       VG_(skin_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   /* Reached when both lines of a straddling access hit. */                \
   HIT_TREATMENT;                                                           \
}

CACHESIM2(cachesim, L2, Read, { return 2; }, { return 1; }, { return 3; } );
CACHESIM2(cachesim, L2, Write, { return 2; }, { return 1; }, { return 3; } );

CACHESIM1(cachesim, I1, Read,
          { return cachesim_L2_doRead(a, size); }, { return 0; } );
CACHESIM1(cachesim, D1, Read,
          { return cachesim_L2_doRead(a, size); }, { return 0; } );
CACHESIM1(cachesim_wt, D1, Write,
          { return cachesim_L2_doWrite(a, size); }, { return 0; } );
CACHESIM1(cachesim_wb, D1, Write,
	  { return cachesim_L2_doWrite(a, size); },
          { cachesim_L2_doWrite(a, size); return 0; } );


/* Prefetch simulation */

/* Number of prefetches issued for ascending / descending block runs. */
static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

/* Number of stream slots, and the shift that turns an address into the
 * page number used to pick a slot (12 bits, i.e. 4k pages). */
#define PF_STREAMS  8
#define PF_PAGEBITS 12

/* Per stream slot: the last block number seen, and the length of the
 * current sequential run (> 0 ascending, < 0 descending, 0 none). */
static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

/* Reset the prefetcher's stream detection state: for every stream slot,
 * forget the last block seen and the sequential-run counter. */
static void prefetch_clear(void)
{
  int i;
  for(i=0;i<PF_STREAMS;i++) {
    pf_lastblock[i] = 0;
    pf_seqblocks[i] = 0;
  }
}

/*
 * HW Prefetch emulation
 *
 * Tracks, per stream slot, the last L2 block accessed and the length of
 * the current run of adjacent blocks (positive for ascending, negative
 * for descending). Prefetching starts when sequential access to 3 memory
 * blocks is detected: from then on, each further step in the same
 * direction triggers an L2 read of the line 5 lines ahead of (or behind)
 * the accessed address. Any other block change ends the run.
 *
 * The slot is chosen from the 4k page number of the address, so at most
 * one stream is tracked per slot at a time.
 * Accesses to the same block as the previous one are ignored.
 * 'size' is not used.
 */
static __inline__
void prefetch_L2_doref(Addr a, UChar size)
{
  const UInt slot       = (a >> PF_PAGEBITS) % PF_STREAMS;
  const UInt cur_block  = ( a >> L2.line_size_bits);
  const UInt prev_block = pf_lastblock[slot];
  Int* run = &pf_seqblocks[slot];
  int  stepped_up, stepped_down;

  if (cur_block == prev_block)
    return;

  stepped_up   = (prev_block + 1 == cur_block);
  stepped_down = (prev_block - 1 == cur_block);

  if (*run == 0) {
    /* No run yet: an adjacent block starts one in its direction. */
    if (stepped_up)
      (*run)++;
    else if (stepped_down)
      (*run)--;
  }
  else if (*run > 0) {
    /* Ascending run: extend it, or drop it on any other block. */
    if (!stepped_up) {
      *run = 0;
    }
    else {
      (*run)++;
      if (*run >= 2) {
        prefetch_up++;
        cachesim_L2_doRead(a + 5 * L2.line_size,1);
      }
    }
  }
  else {
    /* Descending run: mirror image of the ascending case. */
    if (!stepped_down) {
      *run = 0;
    }
    else {
      (*run)--;
      if (*run <= -2) {
        prefetch_down++;
        cachesim_L2_doRead(a - 5 * L2.line_size,1);
      }
    }
  }

  pf_lastblock[slot] = cur_block;
}

/* L2 read that first feeds the access to the hardware prefetcher
 * emulation when clo_simulate_hwpref is set.
 * Returns the result of cachesim_L2_doRead() for the access itself. */
static Int prefetch_L2_doRead(Addr a, UChar size)
{
  Int l2_result;

  if (clo_simulate_hwpref) {
    prefetch_L2_doref(a, size);
  }

  l2_result = cachesim_L2_doRead(a, size);
  return l2_result;
}

/* L2 write that first feeds the access to the hardware prefetcher
 * emulation when clo_simulate_hwpref is set.
 * Returns the result of cachesim_L2_doWrite() for the access itself. */
static Int prefetch_L2_doWrite(Addr a, UChar size)
{
  Int l2_result;

  if (clo_simulate_hwpref) {
    prefetch_L2_doref(a, size);
  }

  l2_result = cachesim_L2_doWrite(a, size);
  return l2_result;
}

CACHESIM1(prefetch, I1, Read,
	  { return prefetch_L2_doRead(a, size); }, { return 0; } );
CACHESIM1(prefetch, D1, Read,
          { return prefetch_L2_doRead(a, size); }, { return 0; } );
CACHESIM1(prefetch_wt, D1, Write,
          { return prefetch_L2_doWrite(a, size); }, { return 0; } );
CACHESIM1(prefetch_wb, D1, Write,
	  { return prefetch_L2_doWrite(a, size); },
          { cachesim_L2_doWrite(a, size); return 0; } );




/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* can not be combined with write-back or prefetch */

/* Cache simulation with use metrics, for the I1/D1 caches.
 *
 * CACHEUSE(L) expands to two functions for the first-level cache L:
 *
 *  cacheuse_<L>_lookup(setNo, tag, use_mask, memline)
 *    References one cache line in set <setNo>.  On a hit the entry is
 *    moved to the MRU position, the use counter of the line is incremented
 *    and <use_mask> is OR-ed into its use mask; the result is 0.  On a miss
 *    the LRU entry is replaced and update_<L>_use() is called for its slot
 *    with <memline>; the result of that call is returned.
 *    The bits of a tags[] entry outside L.tag_mask hold the slot number
 *    used to index L.use[] / L.loaded[]; they stay with the entry when the
 *    set is reordered.
 *
 *  cacheuse_<L>_doRead(a, size)
 *    Simulates an access of <size> bytes at address <a>.
 *    Returns 0 for an L1 hit, 1 for an L1 miss and 2 for an L2 miss.  An
 *    access straddling two lines references both lines (each with the tag
 *    of its own address) and returns the larger of the two results.
 *    Panics if the access would touch more than two sets.
 */
#define CACHEUSE(L)                                                         \
                                                                            \
static __inline__                                                           \
Int cacheuse##_##L##_lookup(UInt setNo, UInt tag,                           \
                            UInt use_mask, Addr memline)                    \
{                                                                           \
   /* Shifting is a bit faster than multiplying */                          \
   int *set = &(L.tags[setNo << L.assoc_bits]);                             \
   int i, j, tmp_tag, idx;                                                  \
                                                                            \
   /* The MRU position is checked separately: it is the most common */      \
   /* case and needs no reordering of the set.                      */      \
   if (tag == (set[0] & L.tag_mask)) {                                      \
      idx = (setNo << L.assoc_bits) | (set[0] & ~L.tag_mask);               \
      L.use[idx].count ++;                                                  \
      L.use[idx].mask |= use_mask;                                          \
      CT_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n",\
               idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,            \
               use_mask, L.use[idx].mask, L.use[idx].count);                \
      return 0;                                                             \
   }                                                                        \
                                                                            \
   /* If the tag is one other than the MRU, move it into the MRU spot */    \
   /* and shuffle the rest down.                                      */    \
   for (i = 1; i < L.assoc; i++) {                                          \
      if (tag == (set[i] & L.tag_mask)) {                                   \
         tmp_tag = set[i];                                                  \
         for (j = i; j > 0; j--) {                                          \
            set[j] = set[j - 1];                                            \
         }                                                                  \
         set[0] = tmp_tag;                                                  \
         idx = (setNo << L.assoc_bits) | (tmp_tag & ~L.tag_mask);           \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CT_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n",\
                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,      \
                  use_mask, L.use[idx].mask, L.use[idx].count);             \
         return 0;                                                          \
      }                                                                     \
   }                                                                        \
                                                                            \
   /* A miss;  install this tag as MRU, shuffle rest down.  The slot */     \
   /* number of the evicted LRU entry is reused for the new line.    */     \
   tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                                \
   for (j = L.assoc - 1; j > 0; j--) {                                      \
      set[j] = set[j - 1];                                                  \
   }                                                                        \
   set[0] = tag | tmp_tag;                                                  \
   idx = (setNo << L.assoc_bits) | tmp_tag;                                 \
   return update_##L##_use(&L, idx, use_mask, memline);                     \
}                                                                           \
                                                                            \
static Int cacheuse##_##L##_doRead(Addr a, UChar size)                      \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UInt tag  = a & L.tag_mask;                                              \
                                                                            \
   CT_DEBUG(6,"%s.Acc(Addr %p, size %d): Sets [%d/%d]\n",                   \
            L.name, a, size, set1, set2);                                   \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
      return cacheuse##_##L##_lookup(set1, tag,                             \
                L.line_start_mask[a & L.line_size_mask] &                   \
                L.line_end_mask[(a+size-1) & L.line_size_mask],             \
                a &~ L.line_size_mask);                                     \
   }                                                                        \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   if (((set1 + 1) & (L.sets-1)) == set2) {                                 \
      /* The second line has its own tag: it differs from <tag> at */       \
      /* least when the access wraps from the last set to set 0.   */       \
      UInt tag2 = (a+size-1) & L.tag_mask;                                  \
      Int miss1, miss2; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */               \
                                                                            \
      miss1 = cacheuse##_##L##_lookup(set1, tag,                            \
                 L.line_start_mask[a & L.line_size_mask],                   \
                 a &~ L.line_size_mask);                                    \
      miss2 = cacheuse##_##L##_lookup(set2, tag2,                           \
                 L.line_end_mask[(a+size-1) & L.line_size_mask],            \
                 (a+size-1) &~ L.line_size_mask);                           \
                                                                            \
      /* Report the worse of both outcomes; OR-ing the codes would */       \
      /* turn 1 and 2 into the undefined value 3.                  */       \
      return (miss1 > miss2) ? miss1 : miss2;                               \
   }                                                                        \
                                                                            \
   VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);     \
   VG_(skin_panic)("item straddles more than two cache sets");              \
   return 0;                                                                \
}


/* Count the bits set in a 32-bit value.
 *
 * Uses the logarithmic (parallel) bit counting scheme described at
 * http://graphics.stanford.edu/~seander/bithacks.html : in round k the
 * value is treated as a vector of fields of 2^k bits, and neighbouring
 * fields are added up pairwise.  After five rounds the whole word holds
 * the population count.
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
  /* masks selecting every other field of 1, 2, 4, 8 and 16 bits */
  static const unsigned int field_mask[5] = {
    0x55555555u, 0x33333333u, 0x0F0F0F0Fu, 0x00FF00FFu, 0x0000FFFFu
  };
  unsigned int sum = bits;
  unsigned int round;

  for (round = 0; round < 5; round++) {
    unsigned int width = 1u << round;

    sum = ((sum >> width) & field_mask[round]) + (sum & field_mask[round]);
  }

  return sum;
}

/* Close the use statistics of the L2 line in slot <idx> and start a fresh
 * record for the line <memline> that replaces it.
 *
 * If collection is active and the old line has a cost array attached
 * (use_base), its use count and its loss -- the share of the line whose
 * mask bits were never set, scaled to the line size -- are added there.
 * Afterwards count and mask are cleared and the slot remembers the new
 * memory line, the address of the current instruction and the cost array
 * the next statistics go to.
 */
static void update_L2_use(int idx, Addr memline)
{
   line_loaded* slot  = &(L2.loaded[idx]);
   line_use*    usage = &(L2.use[idx]);
   int loss = ((32 - countBits(usage->mask)) * L2.line_size)>>5;

   CT_DEBUG(6, " L2.miss [%d]: at %p accessing memline %p\n",
            idx, bb_base + current_ii->instr_offset, memline);
   if (usage->count>0) {
      CT_DEBUG(6, "   old: used %d, lossed bits %d (%08x) [line %p from %p]\n",
               usage->count, loss, usage->mask, slot->memline, slot->iaddr);
      CT_DEBUG(6, "   collect: %d, use_base %p\n",
               SK_(current_state).collect, slot->use_base);
   }

   /* book the statistics of the evicted line */
   if (SK_(current_state).collect && slot->use_base) {
      (slot->use_base)[off_L2_TUseCount]  += usage->count;
      (slot->use_base)[off_L2_SLossCount] += loss;
   }

   /* start over for the new line */
   usage->mask  = 0;
   usage->count = 0;

   slot->memline = memline;
   slot->iaddr   = bb_base + current_ii->instr_offset;
   if (SK_(current_state).nonskipped) {
      slot->use_base = SK_(current_state).nonskipped->skipped;
   }
   else {
      slot->use_base = cost_base + current_ii->cost_offset;
   }
}

/* Reference the L2 line of <memline> on behalf of a first-level line.
 *
 * The set is kept in LRU order with the most recently used entry first.
 * In every case l1_loaded->dep_use is pointed at the use record of the L2
 * slot that now holds the line.  On a miss the LRU entry is replaced and
 * update_L2_use() is called for its slot.
 *
 * Returns 1 on an L2 hit and 2 on an L2 miss.
 */
static
Int cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
   UInt tag   = memline & L2.tag_mask;
   int* ways  = &(L2.tags[setNo << L2.assoc_bits]);

   int way, pos;
   int entry, idx;

   CT_DEBUG(6,"L2.Acc(Memline %p): Set %d\n", memline, setNo);

   /* hit in the MRU position: no reordering needed */
   if ((ways[0] & L2.tag_mask) == tag) {
      idx = (setNo << L2.assoc_bits) | (ways[0] & ~L2.tag_mask);
      l1_loaded->dep_use = &(L2.use[idx]);

      CT_DEBUG(6," Hit0 [idx %d] (line %p from %p): => %08x, count %d\n",
               idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
               L2.use[idx].mask, L2.use[idx].count);
      return 1;
   }

   /* hit elsewhere: pull the entry to the front */
   for (way = 1; way < L2.assoc; way++) {
      if ((ways[way] & L2.tag_mask) != tag) continue;

      entry = ways[way];
      pos = way;
      while (pos > 0) {
         ways[pos] = ways[pos - 1];
         pos--;
      }
      ways[0] = entry;

      idx = (setNo << L2.assoc_bits) | (entry & ~L2.tag_mask);
      l1_loaded->dep_use = &(L2.use[idx]);

      CT_DEBUG(6," Hit%d [idx %d] (line %p from %p): => %08x, count %d\n",
               way, idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
               L2.use[idx].mask, L2.use[idx].count);
      return 1;
   }

   /* miss: the slot number of the LRU entry is taken over by the new tag,
    * which becomes MRU while all other entries move down by one */
   entry = ways[L2.assoc - 1] & ~L2.tag_mask;
   pos = L2.assoc - 1;
   while (pos > 0) {
      ways[pos] = ways[pos - 1];
      pos--;
   }
   ways[0] = tag | entry;

   idx = (setNo << L2.assoc_bits) | entry;
   l1_loaded->dep_use = &(L2.use[idx]);

   update_L2_use(idx, memline);

   return 2;
}




/* UPDATE_USE(L) generates update_<L>_use(cache, idx, mask, memline).
 *
 * The function closes the use statistics of the first-level line in slot
 * <idx> of <cache> and starts a new record for <memline>:
 *  - if collection is active and the old line has a cost array attached,
 *    its use count and loss (share of the line whose mask bits were never
 *    set, scaled to the line size) are added there, and count and mask are
 *    also merged into the use record of the L2 line it depended on;
 *  - the new record starts with count 1 and use mask <mask>.
 * Returns 1 without touching L2 if <memline> is 0, otherwise the result of
 * cacheuse_L2_access() for <memline>.
 */
#define UPDATE_USE(L)                                                \
                                                                     \
static Int update##_##L##_use(cache_t2* cache, int idx,              \
                              UInt mask, Addr memline)               \
{                                                                    \
  line_loaded* slot  = &(cache->loaded[idx]);                        \
  line_use*    usage = &(cache->use[idx]);                           \
  int loss = ((32 - countBits(usage->mask)) * cache->line_size)>>5;  \
                                                                     \
  CT_DEBUG(6, " %s.miss [%d]: at %p accessing memline %p (mask %08x)\n", \
           cache->name, idx, bb_base + current_ii->instr_offset,     \
           memline, mask);                                           \
  if (usage->count>0) {                                              \
    CT_DEBUG(6, "   old: used %d, lossed bits %d (%08x) [line %p from %p]\n",\
             usage->count, loss, usage->mask,                        \
             slot->memline, slot->iaddr);                            \
    CT_DEBUG(6, "   collect: %d, use_base %p\n",                     \
             SK_(current_state).collect, slot->use_base);            \
  }                                                                  \
                                                                     \
  /* book the statistics of the evicted line */                      \
  if (SK_(current_state).collect && slot->use_base) {                \
    (slot->use_base)[off_##L##_TUseCount]  += usage->count;          \
    (slot->use_base)[off_##L##_SLossCount] += loss;                  \
                                                                     \
    /* FIXME (?): L1/L2 line sizes must be equal ! */                \
    slot->dep_use->mask  |= usage->mask;                             \
    slot->dep_use->count += usage->count;                            \
  }                                                                  \
                                                                     \
  /* the access that caused the fill is the first use */             \
  usage->mask  = mask;                                               \
  usage->count = 1;                                                  \
                                                                     \
  slot->memline = memline;                                           \
  slot->iaddr   = bb_base + current_ii->instr_offset;                \
  if (SK_(current_state).nonskipped) {                               \
    slot->use_base = SK_(current_state).nonskipped->skipped;         \
  }                                                                  \
  else {                                                             \
    slot->use_base = cost_base + current_ii->cost_offset;            \
  }                                                                  \
                                                                     \
  if (memline != 0) {                                                \
    return cacheuse_L2_access(memline, slot);                        \
  }                                                                  \
  return 1;                                                          \
}

/* Generate update_I1_use() / update_D1_use() first: the access functions
 * generated by CACHEUSE() below call them on a miss. */
UPDATE_USE(I1);
UPDATE_USE(D1);

/* Generate cacheuse_I1_doRead() / cacheuse_D1_doRead(). */
CACHEUSE(I1);
CACHEUSE(D1);

/* D1 write for the cache-use simulation (write-through variant): a write
 * is simulated exactly like a read of the same address and size. */
static __inline__ Int cacheuse_wt_D1_doWrite(Addr a, UChar size)
{
  Int miss = cacheuse_D1_doRead(a, size);

  return miss;
}

/* Flush the use statistics of all lines still resident in I1, D1 and L2.
 *
 * Does nothing unless collection is active.  Every slot that has a cost
 * array attached is run through the matching update_*_use() function with
 * memory line 0, which books its pending use count and loss.
 *
 * Side effects: bb_base and cost_base are reset to 0 and current_ii is
 * pointed at an all-zero dummy InstrInfo, so the "new" records written by
 * the update functions carry instruction address 0.  The dummy has static
 * storage duration because the global current_ii keeps pointing at it
 * after this function has returned.
 */
static void cacheuse_finish(void)
{
  static InstrInfo ii = { 0,0,0,0,0 };
  int i;

  if (!SK_(current_state).collect) return;

  bb_base = 0;
  current_ii = &ii;
  cost_base = 0;

  /* update usage counters */
  if (I1.use) {
    for (i = 0; i < I1.sets * I1.assoc; i++) {
      if (I1.loaded[i].use_base)
        update_I1_use( &I1, i, 0,0);
    }
  }

  if (D1.use) {
    for (i = 0; i < D1.sets * D1.assoc; i++) {
      if (D1.loaded[i].use_base)
        update_D1_use( &D1, i, 0,0);
    }
  }

  if (L2.use) {
    for (i = 0; i < L2.sets * L2.assoc; i++) {
      if (L2.loaded[i].use_base)
        update_L2_use(i, 0);
    }
  }
}
  


/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


/* Book one memory access with outcome <miss> in two cost arrays at once.
 *
 * Entry 0 of both arrays is incremented for every call, entry 1 if miss is
 * not 0, entry 2 if miss is neither 0 nor 1.  For any other value entry 3
 * is incremented as well when write-back simulation is enabled, and the
 * value is asserted to be 3.
 * (The simulation functions in this file use 0 for an L1 hit, 1 for an L1
 * miss and 2 for an L2 miss.)
 */
static __inline__
void inc_costs(int miss, ULong* c1, ULong* c2)
{
   c1[0]++;
   c2[0]++;

   if (miss != 0) {
      c1[1]++;
      c2[1]++;

      if (miss != 1) {
         c1[2]++;
         c2[2]++;

         if (miss != 2) {
            if (clo_simulate_writeback) {
               c1[3]++;
               c2[3]++;
            }

            CT_ASSERT(miss==3);
         }
      }
   }
}


/* Second half of the log_0D helpers (instruction without data access).
 *
 * If collection is active, the instruction fetch outcome <missIr> is
 * booked both in the global cost of the current state and in a second
 * array: the skipped-cost array of the nonskipped function if there is
 * one, otherwise the cost slot of this instruction.
 * Always ends the VgpCacheSimulate profiling section.
 */
static __inline__
void finish_log_0D(InstrInfo* ii, Int missIr)
{
   CT_DEBUG(6, "log_0D:  Ir=%p/%u => Ir %d\n",
            bb_base + ii->instr_offset, ii->instr_size, missIr);

   if (SK_(current_state).collect) {
      ULong* own_Ir;

      if (!SK_(current_state).nonskipped) {
         own_Ir = cost_base + ii->cost_offset + off_D0_Ir;
      }
      else {
         own_Ir = SK_(current_state).nonskipped->skipped
                  + SK_(sets).off_full_Ir;
      }

      inc_costs(missIr, own_Ir,
                SK_(current_state).cost + SK_(sets).off_full_Ir );
   }

   VGP_POPCC(VgpCacheSimulate);
}

/* LOG0D(FNAME) generates FNAME_log_0D(ii), the helper for an instruction
 * without data access: it opens the VgpCacheSimulate profiling section,
 * makes <ii> the current instruction, simulates the instruction fetch with
 * FNAME_I1_doRead() and passes the outcome to finish_log_0D(). */
#define LOG0D(FNAME)                                   \
                                                       \
__attribute__ ((regparm (1)))                          \
static void FNAME##_log_0D(InstrInfo* ii)              \
{                                                      \
  Int fetch_miss;                                      \
                                                       \
  VGP_PUSHCC(VgpCacheSimulate);                        \
  current_ii = ii;                                     \
                                                       \
  fetch_miss = FNAME##_I1_doRead(                      \
                  bb_base + ii->instr_offset,          \
                  ii->instr_size);                     \
                                                       \
  finish_log_0D(ii, fetch_miss);                       \
}

/* Defines cachesim_log_0D(), prefetch_log_0D() and cacheuse_log_0D(),
 * one per simulation variant. */
LOG0D(cachesim);
LOG0D(prefetch);
LOG0D(cacheuse);

/* Instruction doing a read access */

/* Second half of the log_1Dr helpers (instruction with one data read).
 *
 * If collection is active, the fetch outcome <missIr> and the read outcome
 * <missDr> are booked in the global cost of the current state and in a
 * second place: the skipped-cost array of the nonskipped function if there
 * is one, otherwise the cost slots of this instruction.  With data
 * collection enabled the read is also reported to SK_(handle_read).
 * Always ends the VgpCacheSimulate profiling section.
 */
static __inline__
void finish_log_1Dr(InstrInfo* ii, Addr data, Int missIr, Int missDr)
{
  CT_DEBUG(6, "log_1Dr: Ir=%p/%u, Dr=%p/%u => Ir %d, Dr %d\n",
           bb_base + ii->instr_offset, ii->instr_size,
           data, ii->data_size, missIr, missDr);

  if (SK_(current_state).collect) {
    ULong *own_Ir, *own_Dr;

    if (!SK_(current_state).nonskipped) {
      own_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
      own_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
    }
    else {
      own_Ir = SK_(current_state).nonskipped->skipped
               + SK_(sets).off_full_Ir;
      own_Dr = SK_(current_state).nonskipped->skipped
               + SK_(sets).off_full_Dr;
    }

    inc_costs(missIr, own_Ir,
              SK_(current_state).cost + SK_(sets).off_full_Ir );
    inc_costs(missDr, own_Dr,
              SK_(current_state).cost + SK_(sets).off_full_Dr );

    if (SK_(clo).collect_data) {
      SK_(handle_read)(data, ii->data_size, missDr);
    }
  }

  VGP_POPCC(VgpCacheSimulate);
}

/* LOG1DR(FNAME) generates FNAME_log_1Dr(ii, data), the helper for an
 * instruction reading from address <data>.  The instruction fetch is
 * simulated before the data read; both outcomes go to finish_log_1Dr(). */
#define LOG1DR(FNAME)                                         \
                                                              \
__attribute__ ((regparm (2)))                                 \
static void FNAME##_log_1Dr(InstrInfo* ii, Addr data)         \
{                                                             \
  Int fetch_miss;                                             \
  Int read_miss;                                              \
                                                              \
  VGP_PUSHCC(VgpCacheSimulate);                               \
  current_ii = ii;                                            \
                                                              \
  /* order matters: fetch first, then the data access */      \
  fetch_miss = FNAME##_I1_doRead(bb_base + ii->instr_offset,  \
                                 ii->instr_size);             \
  read_miss  = FNAME##_D1_doRead(data, ii->data_size);        \
                                                              \
  finish_log_1Dr(ii, data, fetch_miss, read_miss);            \
}

/* Defines cachesim_log_1Dr(), prefetch_log_1Dr() and cacheuse_log_1Dr(),
 * one per simulation variant. */
LOG1DR(cachesim);
LOG1DR(prefetch);
LOG1DR(cacheuse);

/* Instruction doing a write access */

/* Second half of the log_1Dw helpers (instruction with one data write).
 *
 * If collection is active, the fetch outcome <missIr> and the write
 * outcome <missDw> are booked in the global cost of the current state and
 * in a second place: the skipped-cost array of the nonskipped function if
 * there is one, otherwise the cost slots of this instruction.  With data
 * collection enabled the write is also reported to SK_(handle_write).
 * Always ends the VgpCacheSimulate profiling section.
 *
 * NOTE(review): the skipped-cost array is indexed with off_sim_* here,
 * while finish_log_0D/finish_log_1Dr use off_full_* for the same array --
 * confirm whether both offset sets are meant to be interchangeable.
 */
static __inline__
void finish_log_1Dw(InstrInfo* ii, Addr data, Int missIr, Int missDw)
{
  CT_DEBUG(6, "log_1Dw: Ir=%p/%u, Dw=%p/%u => Ir %d, Dw %d\n",
           bb_base + ii->instr_offset, ii->instr_size,
           data, ii->data_size, missIr, missDw);

  if (SK_(current_state).collect) {
    ULong *own_Ir, *own_Dw;

    if (!SK_(current_state).nonskipped) {
      own_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
      own_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
    }
    else {
      own_Ir = SK_(current_state).nonskipped->skipped
               + SK_(sets).off_sim_Ir;
      own_Dw = SK_(current_state).nonskipped->skipped
               + SK_(sets).off_sim_Dw;
    }

    inc_costs(missIr, own_Ir,
              SK_(current_state).cost + SK_(sets).off_full_Ir );
    inc_costs(missDw, own_Dw,
              SK_(current_state).cost + SK_(sets).off_full_Dw );

    if (SK_(clo).collect_data) {
      SK_(handle_write)(data, ii->data_size, missDw);
    }
  }

  VGP_POPCC(VgpCacheSimulate);
}

/* LOG1DW(FNAME,WPOLICY) generates FNAME_WPOLICY_log_1Dw(ii, data), the
 * helper for an instruction writing to address <data>.  The instruction
 * fetch is simulated with FNAME_I1_doRead() before the write is simulated
 * with FNAME_WPOLICY_D1_doWrite(); both outcomes go to finish_log_1Dw(). */
#define LOG1DW(FNAME,WPOLICY)                                      \
                                                                   \
__attribute__ ((regparm (2)))                                      \
static void FNAME##_##WPOLICY##_log_1Dw(InstrInfo* ii, Addr data)  \
{                                                                  \
  Int fetch_miss;                                                  \
  Int write_miss;                                                  \
                                                                   \
  VGP_PUSHCC(VgpCacheSimulate);                                    \
  current_ii = ii;                                                 \
                                                                   \
  /* order matters: fetch first, then the data access */           \
  fetch_miss = FNAME##_I1_doRead(bb_base + ii->instr_offset,       \
                                 ii->instr_size);                  \
  write_miss = FNAME##_##WPOLICY##_D1_doWrite(data,                \
                                              ii->data_size);      \
                                                                   \
  finish_log_1Dw(ii, data, fetch_miss, write_miss);                \
}

/* Defines <sim>_<policy>_log_1Dw() for every supported combination of
 * simulation variant and write policy; the cache-use simulation is only
 * instantiated for the "wt" policy. */
LOG1DW(cachesim,wt);
LOG1DW(cachesim,wb);
LOG1DW(prefetch,wt);
LOG1DW(prefetch,wb);
LOG1DW(cacheuse,wt);

/* Instruction doing a read and a write access */

/* Second half of the log_2D helpers (instruction with a data read at
 * <data1> and a data write at <data2>).
 *
 * If collection is active, the fetch, read and write outcomes are booked
 * in the global cost of the current state and in a second place: the
 * skipped-cost array of the nonskipped function if there is one, otherwise
 * the cost slots of this instruction.  With data collection enabled the
 * read and then the write are also reported to SK_(handle_read) and
 * SK_(handle_write).
 * Always ends the VgpCacheSimulate profiling section.
 *
 * NOTE(review): the skipped-cost array is indexed with off_sim_* here,
 * while finish_log_0D/finish_log_1Dr use off_full_* for the same array --
 * confirm whether both offset sets are meant to be interchangeable.
 */
static
void finish_log_2D(InstrInfo* ii, Addr data1, Addr data2,
                   Int missIr, Int missDr, Int missDw)
{
  CT_DEBUG(6,
           "log_2D: Ir=%p/%u, Dr=%p/%u, Dw=%p/%u => Ir %d, Dr %d, Dw %d\n",
           bb_base + ii->instr_offset, ii->instr_size,
           data1, ii->data_size, data2, ii->data_size,
           missIr, missDr, missDw);

  if (SK_(current_state).collect) {
    ULong *own_Ir, *own_Dr, *own_Dw;

    if (!SK_(current_state).nonskipped) {
      own_Ir = cost_base + ii->cost_offset + off_D2_Ir;
      own_Dr = cost_base + ii->cost_offset + off_D2_Dr;
      own_Dw = cost_base + ii->cost_offset + off_D2_Dw;
    }
    else {
      own_Ir = SK_(current_state).nonskipped->skipped
               + SK_(sets).off_sim_Ir;
      own_Dr = SK_(current_state).nonskipped->skipped
               + SK_(sets).off_sim_Dr;
      own_Dw = SK_(current_state).nonskipped->skipped
               + SK_(sets).off_sim_Dw;
    }

    inc_costs(missIr, own_Ir,
              SK_(current_state).cost + SK_(sets).off_full_Ir );
    inc_costs(missDr, own_Dr,
              SK_(current_state).cost + SK_(sets).off_full_Dr );
    inc_costs(missDw, own_Dw,
              SK_(current_state).cost + SK_(sets).off_full_Dw );

    if (SK_(clo).collect_data) {
      SK_(handle_read)(data1, ii->data_size, missDr);
      SK_(handle_write)(data2, ii->data_size, missDw);
    }
  }

  VGP_POPCC(VgpCacheSimulate);
}

/* LOG2D(FNAME,WPOLICY) generates FNAME_WPOLICY_log_2D(ii, data1, data2),
 * the helper for an instruction reading from <data1> and writing to
 * <data2>.  Fetch, read and write are simulated in this order; the three
 * outcomes go to finish_log_2D(). */
#define LOG2D(FNAME,WPOLICY)                                       \
                                                                   \
__attribute__ ((regparm (3)))                                      \
static void FNAME##_##WPOLICY##_log_2D(InstrInfo* ii,              \
                                       Addr data1, Addr data2)     \
{                                                                  \
  Int fetch_miss;                                                  \
  Int read_miss;                                                   \
  Int write_miss;                                                  \
                                                                   \
  VGP_PUSHCC(VgpCacheSimulate);                                    \
  current_ii = ii;                                                 \
                                                                   \
  /* order matters: fetch, then read, then write */                \
  fetch_miss = FNAME##_I1_doRead(bb_base + ii->instr_offset,       \
                                 ii->instr_size);                  \
  read_miss  = FNAME##_D1_doRead(data1, ii->data_size);            \
  write_miss = FNAME##_##WPOLICY##_D1_doWrite(data2,               \
                                              ii->data_size);      \
                                                                   \
  finish_log_2D(ii, data1, data2,                                  \
                fetch_miss, read_miss, write_miss);                \
}

/* Defines <sim>_<policy>_log_2D() for every supported combination of
 * simulation variant and write policy; the cache-use simulation is only
 * instantiated for the "wt" policy. */
LOG2D(cachesim,wt);
LOG2D(cachesim,wb);
LOG2D(prefetch,wt);
LOG2D(prefetch,wb);
LOG2D(cacheuse,wt);

/*------------------------------------------------------------*/
/*--- Automagic cache initialisation stuff                 ---*/
/*------------------------------------------------------------*/

/* Marker value for a cache configuration that has not been set: size,
 * associativity and line size are all -1, sectored is False. */
#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1, False }) 

/* Cache configurations for I1, D1 and L2; all start out undefined.
 * NOTE(review): the clo_ prefix suggests these are filled from command
 * line options -- confirm where they are written. */
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;

/* All CPUID info taken from sandpile.org/a32/cpuid.htm */
/* Probably only works for Intel and AMD chips, and probably only for some of
 * them. 
 */

/* Execute the CPUID instruction with <n> in EAX and store the resulting
 * EAX, EBX, ECX and EDX values in *a, *b, *c and *d.
 * The input constraint "0" ties <n> to output operand 0, i.e. the same
 * register as *a (EAX).  No other input register is set up. */
static __inline__ void cpuid(Int n, UInt *a, UInt *b, UInt *c, UInt *d)
{
   __asm__ __volatile__ (
    "cpuid"
    : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)      /* output */
    : "0" (n)         /* input */
    );
}

/* Emit the two-line debug warning used when the CPU reports a micro-op
 * trace cache instead of a byte-sized instruction cache: the first line
 * names the reported capacity (<actual_size>, in K micro-ops), the second
 * the cache that is simulated in its place (<used_size> KB with
 * <line_size> byte lines). */
static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
{
    /* what the processor reported */
    VG_(message)(Vg_DebugMsg,
                 "warning: Pentium with %d K micro-op instruction trace cache",
                 actual_size);

    /* what is simulated instead */
    VG_(message)(Vg_DebugMsg,
                 "         Simulating a %d KB cache with %d B lines",
                 used_size, line_size);
}

/* Intel method is truly wretched.  We have to do an insane indexing into an
 * array of pre-defined configurations for various parts of the memory
 * hierarchy.
 *
 * CPUID leaf 2 returns up to 16 one-byte descriptors in EAX..EDX.  The low
 * byte of EAX (AL) is not a descriptor but the number of times the leaf
 * has to be queried; only a count of 1 is supported here.  A register
 * whose bit 31 is set contains no valid descriptors and is ignored.
 *
 * level:          highest standard CPUID leaf supported by the processor
 * I1c, D1c, L2c:  receive the detected configuration; an entry is only
 *                 written if a matching descriptor is found.  Sizes are
 *                 given in KB, line sizes in bytes.
 *
 * Returns 0 on success, -1 if leaf 2 is unavailable or would have to be
 * queried more than once.
 */
static
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
   UInt  regs[4];
   UChar info[16];
   Int   i, r, trials;
   Bool  L2_found = False;

   if (level < 2) {
      VG_(message)(Vg_DebugMsg, 
         "warning: CPUID level < 2 for Intel processor (%d)", 
         level);
      return -1;
   }

   cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);

   trials = (Int)(regs[0] & 0xff) - 1;  /* AL register - bits 0..7 of %eax */

   if (0 != trials) {
      VG_(message)(Vg_DebugMsg, 
         "warning: non-zero CPUID trials for Intel processor (%d)",
         trials);
      return -1;
   }

   /* Unpack the registers into descriptor bytes, lowest byte first (the
      order they have in memory on x86).  Registers flagged as invalid by
      bit 31 are replaced by zeros, which the loop below ignores. */
   for (r = 0; r < 4; r++) {
      UInt v = (regs[r] & 0x80000000) ? 0 : regs[r];
      for (i = 0; i < 4; i++)
         info[4*r + i] = (UChar)((v >> (8*i)) & 0xff);
   }
   info[0] = 0x0;           /* reset AL */

   for (i = 0; i < 16; i++) {

      switch (info[i]) {

      case 0x0:       /* ignore zeros */
          break;
          
      /* TLB info, ignore */
      case 0x01: case 0x02: case 0x03: case 0x04:
      case 0x50: case 0x51: case 0x52: case 0x5b: case 0x5c: case 0x5d:
      case 0xb0: case 0xb3:
          break;      

      case 0x06: *I1c = (cache_t) {  8, 4, 32 }; break;
      case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break;
      case 0x30: *I1c = (cache_t) { 32, 8, 64 }; break;

      case 0x0a: *D1c = (cache_t) {  8, 2, 32, False }; break;
      case 0x0c: *D1c = (cache_t) { 16, 4, 32, False }; break;
      case 0x2c: *D1c = (cache_t) {  32, 8, 64, False }; break;

      /* IA-64 info -- panic! */
      case 0x10: case 0x15: case 0x1a: 
      case 0x88: case 0x89: case 0x8a: case 0x8d:
      case 0x90: case 0x96: case 0x9b:
         VG_(message)(Vg_DebugMsg,
            "error: IA-64 cache stats!  Cachegrind doesn't run on IA-64...");
         VG_(skin_panic)("IA-64 detected");
         break;       /* not reached; avoids falling into the next case */

      case 0x22: case 0x23: case 0x25: case 0x29: 
          VG_(message)(Vg_DebugMsg, 
             "warning: L3 cache detected but ignored\n");
          break;

      /* These are sectored, whatever that means */
      case 0x39: *L2c = (cache_t) {  128, 4, 64, True }; L2_found = True; break;
      case 0x3c: *L2c = (cache_t) {  256, 4, 64, True }; L2_found = True; break;

      /* If a P6 core, this means "no L2 cache".  
         If a P4 core, this means "no L3 cache".
         We don't know what core it is, so don't issue a warning.  To detect
         a missing L2 cache, we use 'L2_found'. */
      case 0x40:
          break;

      case 0x41: *L2c = (cache_t) {  128, 4, 32, False }; L2_found = True; break;
      case 0x42: *L2c = (cache_t) {  256, 4, 32, False }; L2_found = True; break;
      case 0x43: *L2c = (cache_t) {  512, 4, 32, False }; L2_found = True; break;
      case 0x44: *L2c = (cache_t) { 1024, 4, 32, False }; L2_found = True; break;
      case 0x45: *L2c = (cache_t) { 2048, 4, 32, False }; L2_found = True; break;

      /* These are sectored, whatever that means */
      case 0x66: *D1c = (cache_t) {  8, 4, 64, True };  break;      /* sectored */
      case 0x67: *D1c = (cache_t) { 16, 4, 64, True };  break;      /* sectored */
      case 0x68: *D1c = (cache_t) { 32, 4, 64, True };  break;      /* sectored */

      /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
       * conversion to byte size is a total guess;  treat the 12K and 16K
       * cases the same since the cache byte size must be a power of two for
       * everything to work!.  Also guessing 32 bytes for the line size... 
       */
      case 0x70:    /* 12K micro-ops, 8-way */
         *I1c = (cache_t) { 16, 8, 32 };  
         micro_ops_warn(12, 16, 32);
         break;  
      case 0x71:    /* 16K micro-ops, 8-way */
         *I1c = (cache_t) { 16, 8, 32 };  
         micro_ops_warn(16, 16, 32); 
         break;  
      case 0x72:    /* 32K micro-ops, 8-way */
         *I1c = (cache_t) { 32, 8, 32 };  
         micro_ops_warn(32, 32, 32); 
         break;  

      /* These are sectored, whatever that means */
      case 0x79: *L2c = (cache_t) {  128, 8,  64, True }; L2_found = True;  break;
      case 0x7a: *L2c = (cache_t) {  256, 8,  64, True }; L2_found = True;  break;
      case 0x7b: *L2c = (cache_t) {  512, 8,  64, True }; L2_found = True;  break;
      case 0x7c: *L2c = (cache_t) { 1024, 8,  64, True }; L2_found = True;  break;
      case 0x7e: *L2c = (cache_t) {  256, 8, 128, True }; L2_found = True;  break;

      case 0x81: *L2c = (cache_t) {  128, 8, 32, True };  L2_found = True;  break;
      case 0x82: *L2c = (cache_t) {  256, 8, 32, True };  L2_found = True;  break;
      case 0x83: *L2c = (cache_t) {  512, 8, 32, True };  L2_found = True;  break;
      case 0x84: *L2c = (cache_t) { 1024, 8, 32, True };  L2_found = True;  break;
      case 0x85: *L2c = (cache_t) { 2048, 8, 32, True };  L2_found = True;  break;
      case 0x86: *L2c = (cache_t) {  512, 4, 64, True };  L2_found = True;  break;
      case 0x87: *L2c = (cache_t) { 1024, 8, 64, True };  L2_found = True;  break;

      default:
          VG_(message)(Vg_DebugMsg, 
             "warning: Unknown Intel cache config value "
             "(0x%x), ignoring", info[i]);
          break;
      }
   }

   if (!L2_found)
      VG_(message)(Vg_DebugMsg, 
         "warning: L2 cache not installed, ignore L2 results.");

   return 0;
}

/* AMD method is straightforward, just extract appropriate bits from the
 * result registers.
 *
 * Bits, for D1 and I1:
 *  31..24  data L1 cache size in KBs    
 *  23..16  data L1 cache associativity (FFh=full)    
 *  15.. 8  data L1 cache lines per tag    
 *   7.. 0  data L1 cache line size in bytes
 *
 * Bits, for L2:
 *  31..16  unified L2 cache size in KBs
 *  15..12  unified L2 cache associativity (0=off, FFh=full)
 *  11.. 8  unified L2 cache lines per tag    
 *   7.. 0  unified L2 cache line size in bytes
 *
 * #3  The AMD K7 processor's L2 cache must be configured prior to relying 
 *     upon this information. (Whatever that means -- njn)
 *
 * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model
 * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
 * so we detect that.
 * 
 * Returns 0 on success, non-zero on failure.
 */
static
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
   UInt ext_level;
   Int dummy, model;
   Int I1i, D1i, L2i;
   
   cpuid(0x80000000, &ext_level, &dummy, &dummy, &dummy);

   if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) {
      VG_(message)(Vg_UserMsg, 
         "warning: ext_level < 0x80000006 for AMD processor (0x%x)", 
         ext_level);
      return -1;
   }

   cpuid(0x80000005, &dummy, &dummy, &D1i, &I1i);
   cpuid(0x80000006, &dummy, &dummy, &L2i, &dummy);

   cpuid(0x1, &model, &dummy, &dummy, &dummy);
   /*VG_(message)(Vg_UserMsg,"CPU model %04x",model);*/

   /* Check for Duron bug */
   if (model == 0x630) {
      VG_(message)(Vg_UserMsg,
         "Buggy Duron stepping A0. Assuming L2 size=65536 bytes");
      L2i = (64 << 16) | (L2i & 0xffff);
   }

   D1c->size      = (D1i >> 24) & 0xff;
   D1c->assoc     = (D1i >> 16) & 0xff;
   D1c->line_size = (D1i >>  0) & 0xff;

   I1c->size      = (I1i >> 24) & 0xff;
   I1c->assoc     = (I1i >> 16) & 0xff;
   I1c->line_size = (I1i >>  0) & 0xff;

   L2c->size      = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
   L2c->assoc     = (L2i >> 12) & 0xf;
   L2c->line_size = (L2i >>  0) & 0xff;

   return 0;
}

#if VG_CORE_INTERFACE_MAJOR_VERSION < 7
/* Older Valgrind cores (< 2.3) do not provide VG_(has_cpuid), so CPUID
 * support has to be probed by executing the instruction with a SIGILL
 * handler installed (done in get_caches_from_CPUID).
 */
/* Jump target that is armed with __builtin_setjmp before CPUID is tried. */
static jmp_buf cpuid_jmpbuf;

/* SIGILL handler: abandons the faulting instruction by jumping back to the
 * __builtin_setjmp point, which then observes a non-zero return value.
 * The signal number is not used. */
static
void cpuid_SIGILL_handler(int signum)
{
   __builtin_longjmp(cpuid_jmpbuf, 1);
}
#endif

/* Detect the I1, D1 and L2 cache parameters of the host via CPUID.
 *
 * On success the three structures hold size (in bytes), associativity and
 * line size, and 0 is returned.  A non-zero value is returned if CPUID is
 * unavailable, reports level 0, names an unknown vendor, or if the
 * vendor-specific query fails; the structures must not be relied upon then.
 */
static 
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
   Int  level, ret;
   Char vendor_id[13];   /* 12 vendor characters + terminating NUL */

#if VG_CORE_INTERFACE_MAJOR_VERSION < 7
   vki_ksigaction sigill_new, sigill_saved;
   Int res;

  /* Install own SIGILL handler */
   sigill_new.ksa_handler  = cpuid_SIGILL_handler;
   sigill_new.ksa_flags    = 0;
   sigill_new.ksa_restorer = NULL;
   res = VG_(ksigemptyset)( &sigill_new.ksa_mask );
   sk_assert(res == 0);

   res = VG_(ksigaction)( VKI_SIGILL, &sigill_new, &sigill_saved );
   sk_assert(res == 0);

   /* Trap for illegal instruction, in case it's a really old processor that
    * doesn't support CPUID. */
   if (__builtin_setjmp(cpuid_jmpbuf) == 0) {
      /* The vendor string is assembled from three of the output words,
       * stored at offsets 0, 8 and 4 (presumably EBX, ECX, EDX in the
       * wrapper's argument order -- confirm against cpuid()). */
      cpuid(0, &level, (int*)&vendor_id[0], 
                       (int*)&vendor_id[8], (int*)&vendor_id[4]);    
      vendor_id[12] = '\0';

      /* Restore old SIGILL handler */
      res = VG_(ksigaction)( VKI_SIGILL, &sigill_saved, NULL );
      CT_ASSERT(res == 0);

   } else  {
      /* Reached via the longjmp in cpuid_SIGILL_handler. */
      VG_(message)(Vg_DebugMsg, "CPUID instruction not supported");

      /* Restore old SIGILL handler */
      res = VG_(ksigaction)( VKI_SIGILL, &sigill_saved, NULL );
      CT_ASSERT(res == 0);
      return -1;
   }
#else
   /* Newer cores can tell us directly whether CPUID exists. */
   if (!VG_(has_cpuid)()) {
      VG_(message)(Vg_DebugMsg, "CPUID instruction not supported");
      return -1;
   }
   VG_(cpuid)(0, &level, (int*)&vendor_id[0],
              (int*)&vendor_id[8], (int*)&vendor_id[4]);
   vendor_id[12] = '\0';
#endif

   if (0 == level) {
      VG_(message)(Vg_DebugMsg, "CPUID level is 0, early Pentium?\n");
      return -1;
   }

   /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
   if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
      ret = Intel_cache_info(level, I1c, D1c, L2c);

   } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
      ret = AMD_cache_info(I1c, D1c, L2c);

   } else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
      /* Total kludge.  Pretend to be a VIA Nehemiah. */
      /* Sizes are given in KB, like the values of the vendor routines. */
      D1c->size      = 64;
      D1c->assoc     = 16;
      D1c->line_size = 16;
      I1c->size      = 64;
      I1c->assoc     = 4;
      I1c->line_size = 16;
      L2c->size      = 64;
      L2c->assoc     = 16;
      L2c->line_size = 16;
      ret = 0;

   } else {
      VG_(message)(Vg_DebugMsg, "CPU vendor ID not recognised (%s)",
                   vendor_id);
      return -1;
   }

   /* Successful!  Convert sizes from KB to bytes */
   /* Nb: this also runs when the vendor routine returned non-zero; the
    * caller (get_caches) discards the values in that case. */
   I1c->size *= 1024;
   D1c->size *= 1024;
   L2c->size *= 1024;
      
   return ret;
}

/* Validate one cache description and repair it in place.
 *
 *   cache - configuration to check; modified when a value is unusable
 *   dflt  - values substituted for entries that are not powers of two
 *   name  - cache name used in the warnings ("I1", "D1", "L2")
 *
 * Every correction is reported as a user-visible warning. */
static 
void check_cache(cache_t* cache, cache_t* dflt, Char *name)
{
   /* Step 1: size, associativity and line size must each be a power of
    * two; anything else is replaced by the corresponding default. */
   if (VG_(log2)(cache->size) == -1) {
      VG_(message)(Vg_UserMsg,
                   "warning: %s size of %dB not a power of two; "
                   "defaulting to %dB",
                   name, cache->size, dflt->size);
      cache->size = dflt->size;
   }

   if (VG_(log2)(cache->assoc) == -1) {
      VG_(message)(Vg_UserMsg,
                   "warning: %s associativity of %d not a power of two; "
                   "defaulting to %d-way",
                   name, cache->assoc, dflt->assoc);
      cache->assoc = dflt->assoc;
   }

   if (VG_(log2)(cache->line_size) == -1) {
      VG_(message)(Vg_UserMsg,
                   "warning: %s line size of %dB not a power of two; "
                   "defaulting to %dB",
                   name, cache->line_size, dflt->line_size);
      cache->line_size = dflt->line_size;
   }

   /* Step 2: enforce the minimum line size.  With shorter lines a single
    * instruction could straddle three cache lines, which the simulation
    * asserts against. */
   if (MIN_LINE_SIZE > cache->line_size) {
      VG_(message)(Vg_UserMsg,
                   "warning: %s line size of %dB too small; "
                   "increasing to %dB",
                   name, cache->line_size, MIN_LINE_SIZE);
      cache->line_size = MIN_LINE_SIZE;
   }

   /* Step 3: the cache must be larger than one line (seg faults if not). */
   if (!(cache->size > cache->line_size)) {
      int two_lines = cache->line_size * 2;

      VG_(message)(Vg_UserMsg,
                   "warning: %s cache size of %dB <= line size of %dB; "
                   "increasing to %dB",
                   name, cache->size, cache->line_size, two_lines);
      cache->size = two_lines;
   }

   /* Step 4: there must be at least one set, i.e. assoc may not exceed
    * the number of lines (seg faults otherwise). */
   if ((cache->size / cache->line_size) < cache->assoc) {
      int one_set = cache->assoc * cache->line_size;

      VG_(message)(Vg_UserMsg,
                   "warning: %s associativity > (size / line size); "
                   "increasing size to %dB",
                   name, one_set);
      cache->size = one_set;
   }
}

/* Determine the configuration of the three simulated caches.
 *
 * The output arguments need not be initialised.  Values given on the
 * command line always win; caches not given there are taken from CPUID when
 * that works, and from built-in defaults otherwise.  The result is finally
 * passed through check_cache() and, at verbosity > 1, printed. */
static 
void get_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
   /* Defaults are for a model 3 or 4 Athlon */
   cache_t I1_dflt = {  65536, 2, 64, False };
   cache_t D1_dflt = {  65536, 2, 64, False };
   cache_t L2_dflt = { 262144, 8, 64, False };

   /* A cache counts as given on the command line as soon as one of its
    * three numbers differs from -1. */
#define WAS_GIVEN(c) \
   ((c).size != -1 || (c).assoc != -1 || (c).line_size != -1)

   Bool I1_given = WAS_GIVEN(clo_I1_cache);
   Bool D1_given = WAS_GIVEN(clo_D1_cache);
   Bool L2_given = WAS_GIVEN(clo_L2_cache);

#undef WAS_GIVEN

   /* Start out from the command-line values. */
   *I1c = clo_I1_cache;
   *D1c = clo_D1_cache;
   *L2c = clo_L2_cache;

   /* Something is missing: ask the CPU. */
   if (!(I1_given && D1_given && L2_given)) {

      if (get_caches_from_CPUID(I1c, D1c, L2c) == 0) {
         /* CPUID filled in all three; put back what the user specified. */
         if (I1_given) *I1c = clo_I1_cache;
         if (D1_given) *D1c = clo_D1_cache;
         if (L2_given) *L2c = clo_L2_cache;
      }
      else {
         /* No CPUID data: defaults replace everything not user-specified. */
         VG_(message)(Vg_DebugMsg, 
                      "Couldn't detect cache configuration, using one "
                      "or more defaults ");

         if (I1_given) *I1c = clo_I1_cache; else *I1c = I1_dflt;
         if (D1_given) *D1c = clo_D1_cache; else *D1c = D1_dflt;
         if (L2_given) *L2c = clo_L2_cache; else *L2c = L2_dflt;
      }
   }

   check_cache(I1c, &I1_dflt, "I1");
   check_cache(D1c, &D1_dflt, "D1");
   check_cache(L2c, &L2_dflt, "L2");

   if (VG_(clo_verbosity) <= 1)
      return;

   VG_(message)(Vg_UserMsg, "Cache configuration used:");
   VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
                I1c->size, I1c->assoc, I1c->line_size);
   VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines%s",
                D1c->size, D1c->assoc, D1c->line_size,
                D1c->sectored ? ", sectored":"");
   VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines%s",
                L2c->size, L2c->assoc, L2c->line_size,
                L2c->sectored ? ", sectored":"");
}

/* Handlers used when only data accesses are collected (no cache simulation) */

/* Log handler for an instruction with one data read: forwards the address
 * and the instruction's data size to SK_(handle_read).  The third argument
 * is always 0 here (its meaning is defined by SK_(handle_read)). */
__attribute__ ((regparm (2)))
static void data_log_1Dr(InstrInfo* ii, Addr data)
{
  SK_(handle_read)(data, ii->data_size, 0);
}

/* Log handler for an instruction with one data write: forwards the address
 * and the instruction's data size to SK_(handle_write).  The third argument
 * is always 0 here (its meaning is defined by SK_(handle_write)). */
__attribute__ ((regparm (2)))
static void data_log_1Dw(InstrInfo* ii, Addr data)
{
  SK_(handle_write)(data, ii->data_size, 0);
}

/* Log handler for an instruction with two data accesses: data1 is reported
 * as a read and data2 as a write, in that order, both with the
 * instruction's data size. */
__attribute__ ((regparm (3)))
  static void data_log_2D(InstrInfo* ii, Addr data1, Addr data2)
{
  SK_(handle_read)(data1, ii->data_size, 0);
  SK_(handle_write)(data2, ii->data_size, 0);
}


/* Set up the simulator once the command line has been processed: determine
 * the cache configuration, initialise the three caches and select the log
 * handlers that match the chosen simulation mode. */
static void cachesim_post_clo_init(void)
{
  cache_t  I1c, D1c, L2c;

  /* Cache configurations. */
  get_caches(&I1c, &D1c, &L2c);

  I1.name = "I1";
  D1.name = "D1";
  L2.name = "L2";

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(L2c, &L2);

  /* Mode 1: no cache simulation.  Only data accesses are logged, and only
   * if data collection was requested. */
  if (!clo_simulate_cache) {
    SK_(cachesim).log_0D  = 0;
    SK_(cachesim).log_1Dr = SK_(clo).collect_data ? data_log_1Dr : 0;
    SK_(cachesim).log_1Dw = SK_(clo).collect_data ? data_log_1Dw : 0;
    SK_(cachesim).log_2D  = SK_(clo).collect_data ? data_log_2D  : 0;
    return;
  }

  /* Mode 2: cache use collection; always uses the write-through variants. */
  if (clo_collect_cacheuse) {
    SK_(cachesim).log_0D  = cacheuse_log_0D;
    SK_(cachesim).log_1Dr = cacheuse_log_1Dr;
    SK_(cachesim).log_1Dw = cacheuse_wt_log_1Dw;
    SK_(cachesim).log_2D  = cacheuse_wt_log_2D;
    return;
  }

  /* Mode 3: simulation with hardware prefetch. */
  if (clo_simulate_hwpref) {
    prefetch_clear();

    SK_(cachesim).log_0D  = prefetch_log_0D;
    SK_(cachesim).log_1Dr = prefetch_log_1Dr;
    if (clo_simulate_writeback) {
      SK_(cachesim).log_1Dw = prefetch_wb_log_1Dw;
      SK_(cachesim).log_2D  = prefetch_wb_log_2D;
    }
    else {
      SK_(cachesim).log_1Dw = prefetch_wt_log_1Dw;
      SK_(cachesim).log_2D  = prefetch_wt_log_2D;
    }
    return;
  }

  /* Mode 4: plain cache simulation. */
  SK_(cachesim).log_0D  = cachesim_log_0D;
  SK_(cachesim).log_1Dr = cachesim_log_1Dr;
  if (clo_simulate_writeback) {
    SK_(cachesim).log_1Dw = cachesim_wb_log_1Dw;
    SK_(cachesim).log_2D  = cachesim_wb_log_2D;
  }
  else {
    SK_(cachesim).log_1Dw = cachesim_wt_log_1Dw;
    SK_(cachesim).log_2D  = cachesim_wt_log_2D;
  }
}

/* Reset the simulator state: clears the I1, D1 and L2 caches and the
 * prefetch state.  The caches must have been initialised beforehand
 * (cachesim_post_clo_init does that). */
static void cachesim_clear()
{
  cachesim_clearcache(&I1);
  cachesim_clearcache(&D1);
  cachesim_clearcache(&L2);

  prefetch_clear();
}


/* Write the three "desc:" lines describing the I1, D1 and L2 caches into
 * buf, one after the other.  The text starts with a newline.  No bounds
 * checking is done; buf must be large enough for all three lines. */
static void cachesim_getdesc(Char* buf)
{
  Char* out = buf;

  /* Each call appends at the end of what was written so far. */
  out += VG_(sprintf)(out, "\ndesc: I1 cache: %s\n", I1.desc_line);
  out += VG_(sprintf)(out, "desc: D1 cache: %s\n", D1.desc_line);
  VG_(sprintf)(out, "desc: L2 cache: %s\n", L2.desc_line);
}

/* Print the usage text for the cache simulator's command line options.
 * The --simulate-sectors line is only included in CT_EXPERIMENTAL builds. */
static void cachesim_print_opts()
{
  VG_(printf)(
"\n   cache simulator options:\n"
"    --simulate-cache=no|yes   Do cache simulation [no]\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CT_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
	      );
}

/* Parse a cache option of the form "--XX=<size>,<assoc>,<line_size>".
 *
 *   cache    - receives the three numbers on success
 *   orig_opt - the complete option string; not modified
 *   opt_len  - length of the "--XX=" prefix, i.e. index of the first digit
 *
 * On a syntax error the option is reported through VG_(bad_option) and
 * cache is left untouched. */
static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
{
   int   i1, i2 = 0, i3 = 0;
   int   i;
   /* Work on a private copy, as the commas get overwritten below. */
   char *opt = VG_(strdup)(orig_opt);

   i = i1 = opt_len;

   /* Option looks like "--I1=65536,2,64".
    * Find commas, replace with NULs to make three independent 
    * strings, then extract numbers.  Yuck. */
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i2 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i3 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   /* Nothing may follow the third number. */
   if ('\0' != opt[i]) goto bad;

   cache->size      = (Int)VG_(atoll)(opt + i1);
   cache->assoc     = (Int)VG_(atoll)(opt + i2);
   cache->line_size = (Int)VG_(atoll)(opt + i3);

   VG_(free)(opt);

   return;

  bad:
   /* Release the working copy on the error path as well; the message
    * uses the untouched original string. */
   VG_(free)(opt);
   VG_(bad_option)(orig_opt);
}

/* Handle one command line argument if it belongs to the cache simulator.
 *
 * Returns True when the argument was recognised (and the matching setting
 * updated), False when it is not one of ours.
 *
 * Called from SK_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(Char* arg)
{
  /* --simulate-cache */
  if (VG_(strcmp)(arg, "--simulate-cache=yes") == 0) {
    clo_simulate_cache = True;
    return True;
  }
  if (VG_(strcmp)(arg, "--simulate-cache=no") == 0) {
    clo_simulate_cache = False;
    return True;
  }

  /* --simulate-wb */
  if (VG_(strcmp)(arg, "--simulate-wb=yes") == 0) {
    clo_simulate_writeback = True;
    return True;
  }
  if (VG_(strcmp)(arg, "--simulate-wb=no") == 0) {
    clo_simulate_writeback = False;
    return True;
  }

  /* --simulate-hwpref */
  if (VG_(strcmp)(arg, "--simulate-hwpref=yes") == 0) {
    clo_simulate_hwpref = True;
    return True;
  }
  if (VG_(strcmp)(arg, "--simulate-hwpref=no") == 0) {
    clo_simulate_hwpref = False;
    return True;
  }

  /* --simulate-sectors */
  if (VG_(strcmp)(arg, "--simulate-sectors=yes") == 0) {
    clo_simulate_sectors = True;
    return True;
  }
  if (VG_(strcmp)(arg, "--simulate-sectors=no") == 0) {
    clo_simulate_sectors = False;
    return True;
  }

  /* --cacheuse; switching it on implies cache simulation */
  if (VG_(strcmp)(arg, "--cacheuse=yes") == 0) {
    clo_collect_cacheuse = True;
    clo_simulate_cache = True;
    /* Use counters only make sense with fine dumping */
    SK_(clo).dump_instr = True;
    return True;
  }
  if (VG_(strcmp)(arg, "--cacheuse=no") == 0) {
    clo_collect_cacheuse = False;
    return True;
  }

  /* Manual cache configuration; 5 is length of "--I1=" */
  if (VG_(strncmp)(arg, "--I1=", 5) == 0) {
    parse_opt(&clo_I1_cache, arg, 5);
    return True;
  }
  if (VG_(strncmp)(arg, "--D1=", 5) == 0) {
    parse_opt(&clo_D1_cache, arg, 5);
    return True;
  }
  if (VG_(strncmp)(arg, "--L2=", 5) == 0) {
    parse_opt(&clo_L2_cache, arg, 5);
    return True;
  }

  return False;
}

/* Format n in decimal with a comma between every group of three digits,
 * right-justified in a field of field_width characters, and store the
 * NUL-terminated result in buf.
 *
 * Returns the length of the commified number itself, i.e. without any
 * padding.  A field_width smaller than that length means no padding. */
static
Int commify(ULong n, int field_width, char* buf)
{
   int digits, commas_left, total_len, pad;
   int src, group;

   VG_(sprintf)(buf, "%llu", n);
   digits      = VG_(strlen)(buf);
   commas_left = (digits - 1) / 3;
   total_len   = digits + commas_left;

   /* Allow for printing a number in a field narrower than its size */
   pad = field_width - total_len;
   if (pad < 0)
      pad = 0;

   /* Shift the string to its final position in place, working backwards
    * from the terminating NUL.  The group counter starts at -1 because
    * the NUL is moved before the first digit. */
   group = -1;
   src   = digits;
   while (src >= 0) {
      buf[src + commas_left + pad] = buf[src];

      if (src > 0) {
         group++;
         if (group == 3) {
            /* Three digits moved: a comma goes in front of them. */
            group = 0;
            commas_left--;
            buf[src + commas_left + pad] = ',';
         }
      }
      src--;
   }

   /* Right justify in field. */
   for (src = 0; src < pad; src++)
      buf[src] = ' ';

   return total_len;
}

/* Format a scaled percentage as "<int>.<frac>%", right-justified in a field
 * of field_width characters, and store the NUL-terminated result in buf.
 *
 *   n  - percentage multiplied by ex
 *   ex - scale factor (e.g. 10 for one decimal place, 100 for two);
 *        must not be 0
 *
 * The fractional part is zero-padded to the number of digits implied by
 * ex, so that n=105, ex=100 yields "1.05%" and not "1.5%".  A field_width
 * smaller than the text means no padding. */
static
void percentify(Int n, Int ex, Int field_width, char buf[]) 
{
   int i, len, space;
   Int frac = n % ex;
   Int frac_width  = 1;   /* digits of the largest remainder, ex - 1 */
   Int frac_digits = 1;   /* digits of the actual remainder */
   Int t;

   for (t = ex - 1; t >= 10; t /= 10) frac_width++;
   for (t = frac;   t >= 10; t /= 10) frac_digits++;

   /* Integer part and decimal point */
   VG_(sprintf)(buf, "%d.", n / ex);
   len = VG_(strlen)(buf);

   /* Leading zeros of the fraction (not for a negative remainder) */
   if (frac >= 0) {
      for (i = frac_digits; i < frac_width; i++)
         buf[len++] = '0';
   }

   VG_(sprintf)(buf + len, "%d%%", frac);
   len = VG_(strlen)(buf);

   space = field_width - len;
   if (space < 0) space = 0;     /* Allow for v. small field_width */
   i = len;

   /* Right justify in field; the copy includes the terminating NUL */
   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}

/* Print the final summary of the collected totals as user messages.
 *
 * Without cache simulation only the instruction reference count is shown.
 * Otherwise the instruction, data and combined L2 figures follow, each with
 * reference counts, miss counts and miss rates.
 *
 * Within each of the Ir/Dr/Dw groups, index +0 is the reference count,
 * +1 the first-level miss count and +2 the L2 miss count.
 *
 * Nb: totals that are 0 are overwritten with 1 further down to avoid
 * divisions by zero, so the statement order matters.
 */
static void cachesim_printstat()
{
  FullCost total = SK_(total_cost), D_total = 0;
  ULong L2_total_m, L2_total_mr, L2_total_mw,
    L2_total, L2_total_r, L2_total_w;
  char buf1[RESULTS_BUF_LEN], 
    buf2[RESULTS_BUF_LEN], 
    buf3[RESULTS_BUF_LEN];
  Int l1, l2, l3;   /* widths of the three output columns */
  Int p;            /* scale factor passed to percentify */

  if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu", 
		 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu", 
		 prefetch_down);
    VG_(message)(Vg_DebugMsg, "");
  }

  /* I cache results.  Use the I_refs value to determine the first column
   * width. */
  l1 = commify(total[SK_(sets).off_full_Ir], 0, buf1);
  VG_(message)(Vg_UserMsg, "I   refs:      %s", buf1);

  /* Everything below needs miss counters from the cache simulation. */
  if (!clo_simulate_cache) return;

  commify(total[SK_(sets).off_full_Ir +1], l1, buf1);
  VG_(message)(Vg_UserMsg, "I1  misses:    %s", buf1);

  commify(total[SK_(sets).off_full_Ir +2], l1, buf1);
  VG_(message)(Vg_UserMsg, "L2i misses:    %s", buf1);

  /* Instruction miss rates are computed with a scale factor of 100. */
  p = 100;

  /* Avoid a division by zero; this changes the stored total. */
  if (0 == total[SK_(sets).off_full_Ir]) 
    total[SK_(sets).off_full_Ir] = 1;

  percentify(total[SK_(sets).off_full_Ir+1] * 100 * p /
	     total[SK_(sets).off_full_Ir], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);
       
  percentify(total[SK_(sets).off_full_Ir+2] * 100 * p /
	     total[SK_(sets).off_full_Ir], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
  VG_(message)(Vg_UserMsg, "");
   
  /* D cache results.
     Use the D_refs.rd and D_refs.wr values to determine the
   * width of columns 2 & 3. */

  /* D_total = read costs + write costs, built in a fresh cost array.
   * NOTE(review): D_total is not released in this function -- confirm
   * whether that is needed. */
  D_total = SK_(get_eventset_cost)( SK_(sets).full );
  SK_(init_cost)( SK_(sets).full, D_total);
  SK_(copy_cost)( SK_(sets).Dr, D_total, total + SK_(sets).off_full_Dr );
  SK_(add_cost) ( SK_(sets).Dw, D_total, total + SK_(sets).off_full_Dw );

  commify( D_total[0], l1, buf1);
  l2 = commify(total[SK_(sets).off_full_Dr], 0,  buf2);
  l3 = commify(total[SK_(sets).off_full_Dw], 0,  buf3);
  VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)",
	       buf1,  buf2,  buf3);

  commify( D_total[1], l1, buf1);
  commify(total[SK_(sets).off_full_Dr+1], l2, buf2);
  commify(total[SK_(sets).off_full_Dw+1], l3, buf3);
  VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)",
	       buf1, buf2, buf3);

  commify( D_total[2], l1, buf1);
  commify(total[SK_(sets).off_full_Dr+2], l2, buf2);
  commify(total[SK_(sets).off_full_Dw+2], l3, buf3);
  VG_(message)(Vg_UserMsg, "L2d misses:    %s  (%s rd + %s wr)",
	       buf1, buf2, buf3);

  /* Data and L2 miss rates are computed with a scale factor of 10. */
  p = 10;
  
  /* Avoid divisions by zero; the replaced values are also used by the
   * L2 miss rate denominators further down. */
  if (0 == D_total[0])   D_total[0] = 1;
  if (0 == total[SK_(sets).off_full_Dr]) total[SK_(sets).off_full_Dr] = 1;
  if (0 == total[SK_(sets).off_full_Dw]) total[SK_(sets).off_full_Dw] = 1;
  
  percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[SK_(sets).off_full_Dr+1] * 100 * p /
	     total[SK_(sets).off_full_Dr], p, l2+1, buf2);
  percentify(total[SK_(sets).off_full_Dw+1] * 100 * p /
	     total[SK_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
  
  percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[SK_(sets).off_full_Dr+2] * 100 * p /
	     total[SK_(sets).off_full_Dr], p, l2+1, buf2);
  percentify(total[SK_(sets).off_full_Dw+2] * 100 * p /
	     total[SK_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
  VG_(message)(Vg_UserMsg, "");


  
  /* L2 overall results */
  
  /* L2 references are the first-level misses of all three access kinds. */
  L2_total   =
    total[SK_(sets).off_full_Dr +1] +
    total[SK_(sets).off_full_Dw +1] +
    total[SK_(sets).off_full_Ir +1];
  L2_total_r =
    total[SK_(sets).off_full_Dr +1] +
    total[SK_(sets).off_full_Ir +1];
  L2_total_w = total[SK_(sets).off_full_Dw +1];
  commify(L2_total,   l1, buf1);
  commify(L2_total_r, l2, buf2);
  commify(L2_total_w, l3, buf3);
  VG_(message)(Vg_UserMsg, "L2 refs:       %s  (%s rd + %s wr)",
	       buf1, buf2, buf3);
  
  L2_total_m  =
    total[SK_(sets).off_full_Dr +2] +
    total[SK_(sets).off_full_Dw +2] +
    total[SK_(sets).off_full_Ir +2];
  L2_total_mr =
    total[SK_(sets).off_full_Dr +2] +
    total[SK_(sets).off_full_Ir +2];
  L2_total_mw = total[SK_(sets).off_full_Dw +2];
  commify(L2_total_m,  l1, buf1);
  commify(L2_total_mr, l2, buf2);
  commify(L2_total_mw, l3, buf3);
  VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)",
	       buf1, buf2, buf3);
  
  /* L2 miss rates are relative to all instruction and data references,
   * not to the number of L2 references. */
  percentify(L2_total_m  * 100 * p /
	     (total[SK_(sets).off_full_Ir] + D_total[0]),  p, l1+1, buf1);
  percentify(L2_total_mr * 100 * p /
	     (total[SK_(sets).off_full_Ir] + total[SK_(sets).off_full_Dr]),
	     p, l2+1, buf2);
  percentify(L2_total_mw * 100 * p /
	     total[SK_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )",
	       buf1, buf2,buf3);
}


/*------------------------------------------------------------*/
/*--- Setup for Event set.                                 ---*/
/*------------------------------------------------------------*/

/* Event sets and offsets used by the simulator; filled in by
 * SK_(init_eventsets)(). */
struct event_sets SK_(sets);

/* Register all event types and build the event sets stored in SK_(sets).
 *
 * max_user is the number of additional slots requested for the "full" set;
 * it is increased here for the allocation and system time events when
 * those are enabled.
 *
 * Nb: the order of the registrations and of the add_eventset calls
 * determines the offsets recorded below, so it must not be changed.
 */
void SK_(init_eventsets)(Int max_user)
{
  EventType * e1, *e2, *e3, *e4;
  EventSet *Ir, *Dr, *Dw;
  EventSet *D0, *D1r, *D1w, *D2;
  EventSet *sim, *full;
  EventSet *use;
  int sizeOfUseIr;

  /* "Use" set: only populated when cache use collection is enabled. */
  use = SK_(get_eventset)("Use", 4);
  if (clo_collect_cacheuse) {
    /* if TUse is 0, there was never a load, and no loss, too */
    e1 = SK_(register_eventtype)("TUse1");
    e2 = SK_(register_eventtype)("SLoss1");
    SK_(add_dep_event2)(use, e1, e2);
    e1 = SK_(register_eventtype)("TUse2");
    e2 = SK_(register_eventtype)("SLoss2");
    SK_(add_dep_event2)(use, e1, e2);
  }

  /* Basic sets for instruction fetches, data reads and data writes.  With
   * cache simulation each gets the access count plus two miss events, and
   * a fourth event when write-back simulation is enabled.  Without cache
   * simulation only "Ir" is registered; Dr and Dw stay empty. */
  Ir = SK_(get_eventset)("Ir", 4);    
  Dr = SK_(get_eventset)("Dr", 4);
  Dw = SK_(get_eventset)("Dw", 4);
  if (clo_simulate_cache) {
    e1 = SK_(register_eventtype)("Ir");
    e2 = SK_(register_eventtype)("I1mr");
    e3 = SK_(register_eventtype)("I2mr");
    if (clo_simulate_writeback) {
      e4 = SK_(register_eventtype)("I2dmr");
      SK_(add_dep_event4)(Ir, e1,e2,e3,e4);
    }
    else
      SK_(add_dep_event3)(Ir, e1,e2,e3);

    e1 = SK_(register_eventtype)("Dr");
    e2 = SK_(register_eventtype)("D1mr");
    e3 = SK_(register_eventtype)("D2mr");
    if (clo_simulate_writeback) {
      e4 = SK_(register_eventtype)("D2dmr");
      SK_(add_dep_event4)(Dr, e1,e2,e3,e4);
    }
    else
      SK_(add_dep_event3)(Dr, e1,e2,e3);
    
    e1 = SK_(register_eventtype)("Dw");
    e2 = SK_(register_eventtype)("D1mw");
    e3 = SK_(register_eventtype)("D2mw");
    if (clo_simulate_writeback) {
      e4 = SK_(register_eventtype)("D2dmw");
      SK_(add_dep_event4)(Dw, e1,e2,e3,e4);
    }
    else
      SK_(add_dep_event3)(Dw, e1,e2,e3);

  }
  else {
    e1 = SK_(register_eventtype)("Ir");
    SK_(add_eventtype)(Ir, e1);
  }

  /* Per-instruction sets, one for each kind of data access an instruction
   * can make.  All start with "use" followed by Ir; the offsets of the
   * parts are kept in file-level variables for add_and_zero_Dx. */
  sizeOfUseIr =  use->size + Ir->size;
  /* D0: no data access */
  D0 = SK_(get_eventset)("D0", sizeOfUseIr);
  SK_(add_eventset)(D0, use);
  off_D0_Ir  = SK_(add_eventset)(D0, Ir);

  /* D1r: one data read */
  D1r = SK_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
  SK_(add_eventset)(D1r, use);
  off_D1r_Ir = SK_(add_eventset)(D1r, Ir);
  off_D1r_Dr = SK_(add_eventset)(D1r, Dr);

  /* D1w: one data write */
  D1w = SK_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
  SK_(add_eventset)(D1w, use);
  off_D1w_Ir   = SK_(add_eventset)(D1w, Ir);
  off_D1w_Dw   = SK_(add_eventset)(D1w, Dw);

  /* D2: one data read and one data write */
  D2  = SK_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
  SK_(add_eventset)(D2, use);
  off_D2_Ir    = SK_(add_eventset)(D2, Ir);
  off_D2_Dr    = SK_(add_eventset)(D2, Dr);
  off_D2_Dw    = SK_(add_eventset)(D2, Dw);

  /* "sim": all simulator events (use, Ir, Dr, Dw) */
  sim = SK_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
  SK_(add_eventset)(sim, use);
  SK_(sets).off_sim_Ir   = SK_(add_eventset)(sim, Ir);
  SK_(sets).off_sim_Dr   = SK_(add_eventset)(sim, Dr);
  SK_(sets).off_sim_Dw   = SK_(add_eventset)(sim, Dw);

  /* Account for the optional event pairs added to "full" further down. */
  if (SK_(clo).collect_alloc) {
    max_user += 2;
    if (SK_(clo).collect_data) max_user += 2;
  }
  if (SK_(clo).collect_systime) max_user += 2;

  /* "full": the sim events followed by the additional events.  As sim
   * comes first, the Ir/Dr/Dw offsets are the same as in sim. */
  full = SK_(get_eventset)("full", sim->size + max_user);
  SK_(add_eventset)(full, sim);
  SK_(sets).off_full_Ir   = SK_(sets).off_sim_Ir;
  SK_(sets).off_full_Dr   = SK_(sets).off_sim_Dr;
  SK_(sets).off_full_Dw   = SK_(sets).off_sim_Dw;

  SK_(sets).use = use;
  SK_(sets).Ir  = Ir;
  SK_(sets).Dr  = Dr;
  SK_(sets).Dw  = Dw;

  SK_(sets).D0  = D0;
  SK_(sets).D1r = D1r;
  SK_(sets).D1w = D1w;
  SK_(sets).D2  = D2;

  SK_(sets).sim  = sim;
  SK_(sets).full = full;

  /* Nb: off_full_user is only assigned when allocation collection is on. */
  if (SK_(clo).collect_alloc) {
    e1 = SK_(register_eventtype)("allocCount");
    e2 = SK_(register_eventtype)("allocSize");
    SK_(sets).off_full_user =  SK_(add_dep_event2)(full, e1,e2);

    if (SK_(clo).collect_data) {
      e1 = SK_(register_eventtype)("freeCount");
      e2 = SK_(register_eventtype)("freeSize");
      SK_(add_dep_event2)(full, e1,e2);
    }
  }

  /* Nb: off_full_systime is only assigned when systime collection is on. */
  if (SK_(clo).collect_systime) {
    e1 = SK_(register_eventtype)("sysCount");
    e2 = SK_(register_eventtype)("sysTime");
    SK_(sets).off_full_systime =  SK_(add_dep_event2)(full, e1,e2);
  }

  CT_DEBUGIF(1) {
    CT_DEBUG(1, "EventSets:\n");
    SK_(print_eventset)(-2, use);
    SK_(print_eventset)(-2, Ir);
    SK_(print_eventset)(-2, Dr);
    SK_(print_eventset)(-2, Dw);
    SK_(print_eventset)(-2, sim);
    SK_(print_eventset)(-2, full);
  }

  /* Build the mapping that fixes the order of the events in the dump:
   * access counts first, then L1 misses, then L2 misses, and so on. */
  /* Not-existing events are silently ignored */
  SK_(dumpmap) = SK_(get_eventmapping)(full);
  SK_(append_event)(SK_(dumpmap), "Ir");
  SK_(append_event)(SK_(dumpmap), "Dr");
  SK_(append_event)(SK_(dumpmap), "Dw");
  SK_(append_event)(SK_(dumpmap), "I1mr");
  SK_(append_event)(SK_(dumpmap), "D1mr");
  SK_(append_event)(SK_(dumpmap), "D1mw");
  SK_(append_event)(SK_(dumpmap), "I2mr");
  SK_(append_event)(SK_(dumpmap), "D2mr");
  SK_(append_event)(SK_(dumpmap), "D2mw");
  SK_(append_event)(SK_(dumpmap), "I2dmr");
  SK_(append_event)(SK_(dumpmap), "D2dmr");
  SK_(append_event)(SK_(dumpmap), "D2dmw");
  SK_(append_event)(SK_(dumpmap), "TUse1");
  SK_(append_event)(SK_(dumpmap), "SLoss1");
  SK_(append_event)(SK_(dumpmap), "TUse2");
  SK_(append_event)(SK_(dumpmap), "SLoss2");
  SK_(append_event)(SK_(dumpmap), "allocCount");
  SK_(append_event)(SK_(dumpmap), "allocSize");
  SK_(append_event)(SK_(dumpmap), "freeCount");
  SK_(append_event)(SK_(dumpmap), "freeSize");  
  SK_(append_event)(SK_(dumpmap), "sysCount");
  SK_(append_event)(SK_(dumpmap), "sysTime");

}



/* Add the cost of one instruction, laid out according to its event set es
 * (one of D0, D1r, D1w, D2), to dst, which is laid out like the "sim" set,
 * and zero the source counters.
 *
 *   es   - event set describing the layout of cost
 *   dst  - accumulated cost
 *   cost - the instruction's counters
 */
static
void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
{
  ULong* ir_part;
  ULong* dr_part = 0;
  ULong* dw_part = 0;
  Bool   has_dr  = False;
  Bool   has_dw  = False;

  /* if eventset use is defined, it is always first (hardcoded!) */
  SK_(add_and_zero_cost)( SK_(sets).use, dst, cost);  

  /* FIXME: This is hardcoded...
   * Locate the Ir/Dr/Dw parts inside the instruction's counters. */
  if (es == SK_(sets).D0) {
    ir_part = cost + off_D0_Ir;
  }
  else if (es == SK_(sets).D1r) {
    ir_part = cost + off_D1r_Ir;
    dr_part = cost + off_D1r_Dr;
    has_dr  = True;
  }
  else if (es == SK_(sets).D1w) {
    ir_part = cost + off_D1w_Ir;
    dw_part = cost + off_D1w_Dw;
    has_dw  = True;
  }
  else {
    CT_ASSERT(es == SK_(sets).D2);
    ir_part = cost + off_D2_Ir;
    dr_part = cost + off_D2_Dr;
    dw_part = cost + off_D2_Dw;
    has_dr  = True;
    has_dw  = True;
  }

  /* Transfer the parts that are present: Ir, then Dr, then Dw. */
  SK_(add_and_zero_cost)( SK_(sets).Ir, dst + SK_(sets).off_sim_Ir, ir_part);
  if (has_dr)
    SK_(add_and_zero_cost)( SK_(sets).Dr, dst + SK_(sets).off_sim_Dr, dr_part);
  if (has_dw)
    SK_(add_and_zero_cost)( SK_(sets).Dw, dst + SK_(sets).off_sim_Dw, dw_part);
}

/* Called at dump time for every executed instruction: adds the cost of
 * instruction ii within bbcc to cost. */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc, InstrInfo* ii)
{
  if (clo_simulate_cache) {
    /* Move the instruction's own counters over (and reset them). */
    add_and_zero_Dx(ii->eventset, cost, bbcc->cost + ii->cost_offset);
    return;
  }

  /* Without simulation, the only cost is the execution count.
   * FIXME: Is this too simple ? */
  cost[SK_(sets).off_sim_Ir] += bbcc->exe_counter;
}

/* Hook run after the current basic block (SK_(current_state).bbcc) has
 * been set up. */
static void cachesim_after_bbsetup()
{
  BBCC* cur = SK_(current_state).bbcc;

  if (!clo_simulate_cache) {
    /* even with skipping the log_ calls, we have to increment
     * the global counter */
    if (SK_(current_state).collect)
      SK_(current_state).cost[SK_(sets).off_sim_Ir] += cur->bb->instr_count;
    return;
  }

  /* only needed if log_* functions are called */
  bb_base   = cur->bb->obj->offset + cur->bb->offset;
  cost_base = cur->cost;
}

/* Called when the simulation ends; finishes cache use collection if it
 * was enabled, and does nothing otherwise. */
static void cachesim_finish()
{
  if (clo_collect_cacheuse)
    cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

/* Interface of the cache simulator: the functions defined in this file,
 * plus the four log handlers, which depend on the command line options. */
struct cachesim_if SK_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .getdesc       = cachesim_getdesc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .after_bbsetup = cachesim_after_bbsetup,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_0D        = 0,
  .log_1Dr       = 0,
  .log_1Dw       = 0,
  .log_2D        = 0
};


/*--------------------------------------------------------------------*/
/*--- end                                                 ct_sim.c ---*/
/*--------------------------------------------------------------------*/

