/* This file is part of q-tools, a collection of performance tools
   Copyright (c) 2003-2006 Hewlett-Packard Development Company, L.P.
   Contributed by David Mosberger-Tang <davidm@hpl.hp.com>
   Contributed by Stephane Eranian <eranian@hpl.hp.com>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA  02111-1307  USA  */

#include <sys/types.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "q-syscollect.h"
#include "call-counts.h"

#ifdef HAVE_EXPLICIT_PERFMON3
# include <perfmon3/pfmlib_itanium2.h>
#else
# include <perfmon/pfmlib_itanium2.h>
#endif


/* Set of PMD registers requested with every sample: registers 8
   through 16, nine in total.  It is installed below as
   reg_smpl_pmds[0] of the sampling counter, and the sample decoders
   in this file read the result as a nine-element array whose last
   element is treated as PMD16.  REG_MASK is defined elsewhere;
   presumably it yields the bit for the given register number.  */
#define BTB_REGS_MASK		(  REG_MASK (8)  | REG_MASK (9)		\
				 | REG_MASK (10) | REG_MASK (11)	\
				 | REG_MASK (12) | REG_MASK (13)	\
				 | REG_MASK (14) | REG_MASK (15)	\
				 | REG_MASK (16))


/* Reconstruct the bundle address recorded in BTB data register REG.

   REGNUM is the zero-based position of REG among the sampled BTB
   data registers and PMD16 is the sampled BTB index register.  The
   address field of REG is multiplied by 16 to form a bundle address;
   when bit (4 + 4*REGNUM) of PMD16 is set, the result is advanced by
   one bundle (0x10).  */
static inline uint64_t
get_btb_ip (int regnum, pfm_ita2_pmd_reg_t reg, pfm_ita2_pmd_reg_t pmd16)
{
  /* We can ignore the slot-number here because functions must start
     and end at a bundle-boundary.

     Widen the address field to 64 bits before shifting.  How a
     bit-field wider than int is promoted is implementation-defined,
     and a compiler that shifts in the field's own precision would
     drop the upper address bits.  */
  uint64_t ip = (uint64_t) reg.pmd8_15_ita2_reg.btb_addr << 4;

  /* Build the mask from a 64-bit one so the shift count (up to 32 for
     REGNUM 7) is always within the width of the operand.  */
  if (pmd16.pmd_val & ((uint64_t) 1 << (4 + 4*regnum)))
    ip += 0x10;

  return ip;
}

/* Derive the code-sample IP for sample ENT from the BTB registers
   stored at *POSP, and advance *POSP past those nine registers.

   The BTB data registers are scanned backwards, starting with the
   one just before the BTB index, for the latest entry with btb_b
   set.  The return value is one of:

     - the address computed by get_btb_ip() for that entry, or for
       the entry following it when the branch was taken and that
       following entry was part of the scan;
     - ENT->ip | 0xd when the branch matches the "rfi" marker pattern
       described below;
     - ENT->ip | 0xc when no entry with btb_b set was found.  */
static uint64_t
get_ita2_ip_from_btb (pfm_dfl_smpl_entry_t *ent, char **posp)
{
  unsigned long reg, num_regs, i, j, bbi, src_addr, dst_addr;
  pfm_ita2_pmd_reg_t *pmd, src, dst, pmd16;

  /* Nine registers per sample: eight BTB data registers followed by
     PMD16.  Consume them regardless of which path returns.  */
  pmd = (pfm_ita2_pmd_reg_t *) *posp;
  *posp = (char *) &pmd[9];

  /* find the most recently recorded branch in the BTB: */
  pmd16 = pmd[8];
  bbi = pmd16.pmd16_ita2_reg.btbi_bbi;

  /* NOTE(review): when the buffer is not full this visits bbi + 1
     entries starting at index (bbi + 7) % 8, so the last entry
     visited is index 7; confirm that entry holds valid data in that
     case.  */
  if (pmd16.pmd16_ita2_reg.btbi_full)
    num_regs = 8;
  else
    num_regs = bbi + 1;
  i = (bbi + 7) % 8;
  for (reg = 0; reg < num_regs; ++reg)
    {
      src = pmd[i];
      if (src.pmd8_15_ita2_reg.btb_b)
	{
	  /* Found the register recording the most-recent branch
	     instruction.  */

	  /* As a special case, if the most recent branch was a taken
	     branch from the last slot in a bundle to the next bundle,
	     we treat it as an "rfi" marker and ignore it.  */
	  /* "reg > 0" means entry (i + 1) & 7 was already visited by
	     this scan, i.e. it is more recent than entry I.  */
	  if (src.pmd8_15_ita2_reg.btb_slot == 2 && reg > 0)
	    {
	      j = (i + 1) & 7;
	      dst = pmd[j];
	      src_addr = get_btb_ip (i, src, pmd16);
	      dst_addr = get_btb_ip (j, dst, pmd16);
	      if (dst_addr - src_addr == 0x10)
		return ent->ip | 0xd;
	    }

	  if (src.pmd8_15_ita2_reg.btb_slot != 3 && reg > 0)
	    {
	      /* the branch was taken and we have a record of the
		 branch destination; record that instead */
	      i = (i + 1) & 7;
	      src = pmd[i];
	    }
	  break;
	}
      /* Step back to the next older entry, wrapping around.  */
      i = (i + 7) % 8;
    }
  /* The loop ran to completion only if no entry had btb_b set.  */
  if (reg >= num_regs)
    {
      /* BTB didn't record any branches!  Use IP from interrupt
	 instead and mark bits 0-3 with "0xc" so we can distinguish
	 the sample from BTB-samples. */
      return ent->ip | 0xc;	/* or in with special marker... */
    }
  return get_btb_ip (i, src, pmd16);
}

/* Convert one BTB sample into call-count edges.

   POS points at the nine registers sampled for ENT: eight BTB data
   registers followed by PMD16.  Entries with btb_b set are treated
   as branch instructions and paired with the entry that follows them
   as the branch destination.  Each pair is passed to
   call_count_add() on AS->cc with the destination address first and
   the source address second, and both addresses are handed to
   check_addr_space_mapping().  The sampling interval
   (-ENT->last_reset_val) is divided, with rounding to nearest, among
   the N branches counted in the first pass.

   Returns a pointer just past the nine registers.  */
static char *
process_ita2_btb_sample (struct addr_space *as,
		    pfm_dfl_smpl_entry_t *ent, char *pos)
{
  pfm_ita2_pmd_reg_t pmd16;
  uint64_t j, src_ip, dst_ip, last, interval;
  pfm_ita2_pmd_reg_t *reg, src, dst;
  unsigned long weight;
  int n;

  reg = (pfm_ita2_pmd_reg_t *) pos;

  interval = -ent->last_reset_val;

  pmd16 = reg[8];

  /* Walk from the first entry (index 0, or the BTB index once the
     buffer has wrapped) up to, but not including, the BTB index.  */
  last = pmd16.pmd16_ita2_reg.btbi_bbi;
  j = pmd16.pmd16_ita2_reg.btbi_full ? pmd16.pmd16_ita2_reg.btbi_bbi : 0;

#if 0
  printf ("first=%lu, last=%lu, pmd16=%lx, interval=%ld\n",
	  j, last, pmd16.pmd_val, interval);
  {
    int k;
    for (k = 0; k < 9; ++k)
      printf ("\treg[%d] = %016lx\n", k, reg[k].pmd_val);
  }
#endif

  /* count the number of branches recorded in the BTB: */
  /* NOTE(review): because this is a do/while, a sample with
     btbi_full clear and a BTB index of 0 still walks all eight
     entries; confirm that is intended.  */
  n = 0;
  do
    {
      src = reg[j];
      dst = reg[(j + 1) & 7];

      if (src.pmd8_15_ds_ita2_reg.btb_b)
	{
	  ++n;
	  /* Skip the following entry too when it is a branch target
	     rather than another branch instruction.  */
	  if (!dst.pmd8_15_ds_ita2_reg.btb_b)
	    j = (j + 1) & 7;
	  if (j == last)
	    break;
	}
      j = (j + 1) & 7;
    }
  while (j != last);

  /* Rewind to the first entry for the second pass.  */
  j =  pmd16.pmd16_ita2_reg.btbi_full ? pmd16.pmd16_ita2_reg.btbi_bbi : 0;

  if (n > 0)
    {
#if 0
      n = 1;
#endif
      /* interval / n, rounded to the nearest integer.  */
      weight = (2*interval + n) / (2*n);

      while (n-- > 0)
	{
	  src = reg[j];
	  dst = reg[(j + 1) & 7];

	  /* Ensure "src" recorded a branch instruction (btb_b set),
	     not a branch target.  */
	  if (src.pmd8_15_ds_ita2_reg.btb_b)
	    {
	      src_ip = get_btb_ip (j, src, pmd16);
	      dst_ip = get_btb_ip ((j + 1) & 7, dst, pmd16);

	      /* Note that we're capturing br.ret, so a return from
		 SRC to DST gets count as a call from DST to
		 SRC...  */
	      call_count_add (as->cc, dst_ip, src_ip, weight);
	      check_addr_space_mapping (as, src_ip);
	      check_addr_space_mapping (as, dst_ip);

	      /* If destination address was recorded by a branch
		 target entry (btb_b set to 0), skip over that entry
		 as well.  */
	      if (!dst.pmd8_15_ds_ita2_reg.btb_b)
		j = (j + 1) & 7;
	    }
	  j = (j + 1) & 7;
	}
    }
  return (char *) &reg[9];	/* XXX must be 16-byte aligned??? */
}

/* Configure BTB-based call-count sampling.

   Asks libpfm to program BRANCH_EVENT at privilege-level mask PLM
   together with a BTB configuration that captures taken return
   branches.  The resulting PMC settings are appended to PC starting
   at index *NUM_PCSP, and two PMD descriptors are filled in starting
   at PD[*NUM_PDSP]: the sampling counter, followed by PMD16 with a
   zero value and zero reset values.  *B_PD is pointed at the
   counter's descriptor, and *NUM_PCSP / *NUM_PDSP are advanced past
   the new entries.  Calls panic() when the event cannot be found or
   dispatched.  */
static void
setup_ita2_call_count_sampling (pfarg_pmc_t *pc, int *num_pcsp,
			   pfarg_pmd_t *pd, int *num_pdsp,
			   unsigned int plm, pfarg_pmd_t **b_pd)
{
  const int pc_base = *num_pcsp;
  pfarg_pmd_t *const counter = pd + *num_pdsp;
  pfarg_pmd_t *const btb_index = counter + 1;
  const unsigned long period = -100000;
  pfmlib_ita2_input_param_t ita2_in;
  pfmlib_input_param_t in;
  pfmlib_output_param_t out;
  unsigned long n;
  int status;

  memset (&in, 0, sizeof (in));
  memset (&ita2_in, 0, sizeof (ita2_in));
  memset (&out, 0, sizeof (out));

  /* Generic request: a single event; system-wide monitoring needs
     privileged monitors.  */
  in.pfp_dfl_plm = plm;
  in.pfp_event_count = 1;
  in.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;

  /* BTB filter: taken return branches only, whatever the target and
     path predictions were.  */
  ita2_in.pfp_ita2_btb.btb_used = 1;
  ita2_in.pfp_ita2_btb.btb_tm  = 0x2;
  ita2_in.pfp_ita2_btb.btb_ptm = 0x3;
  ita2_in.pfp_ita2_btb.btb_ppm = 0x3;
  ita2_in.pfp_ita2_btb.btb_brt = 0x2;
  ita2_in.pfp_ita2_btb.btb_plm = plm;

  if (pfm_find_event_byname ("BRANCH_EVENT", &in.pfp_events[0].event)
      != PFMLIB_SUCCESS)
    panic ("pfm_find_event_byname: failed to find BRANCH_EVENT\n");

  status = pfm_dispatch_events (&in, &ita2_in, &out, NULL);
  if (status != PFMLIB_SUCCESS)
    panic ("pfm_dispatch_events(): %s\n", pfm_strerror (status));

  /* Append the PMC settings chosen by libpfm.  */
  for (n = 0; n < out.pfp_pmc_count; ++n)
    {
      pfarg_pmc_t *slot = &pc[pc_base + n];

      slot->reg_num = out.pfp_pmcs[n].reg_num;
      slot->reg_value = out.pfp_pmcs[n].reg_value;
    }

  /* The BRANCH_EVENT counter takes its register number from the
     first PMC appended above.  */
  counter->reg_num = pc[pc_base].reg_num;
  *b_pd = counter;

  /* On counter overflow: record the BTB registers, notify, pick a
     randomized next interval, and reset the BTB index (PMD16).  */
  counter->reg_smpl_pmds[0] = BTB_REGS_MASK;
  counter->reg_flags |= PFM_REGFL_OVFL_NOTIFY | PFM_REGFL_RANDOM;
  counter->reg_reset_pmds[0] = REG_MASK (16);

  /* The same period serves as the initial value and as both the long
     and the short reset value.  */
  counter->reg_value = period;
  counter->reg_long_reset = period;
  counter->reg_short_reset = period;
  counter->reg_random_seed = 0xc0ffee;
  counter->reg_random_mask = 0x3ff;

  /* PMD16 starts at, and is reset to, zero.  */
  btb_index->reg_num = 16;
  btb_index->reg_value = 0;
  btb_index->reg_flags = 0;
  btb_index->reg_long_reset = 0;
  btb_index->reg_short_reset = 0;

  /* Publish the new descriptor counts.  */
  *num_pcsp += out.pfp_pmc_count;
  *num_pdsp += 2;
}
	
/* Configure BTB-assisted code sampling.

   Asks libpfm to program the event selected by cs_event_code at
   privilege-level mask PLM, together with a BTB configuration that
   captures every branch type whether or not it was taken.  The
   resulting PMC settings are appended to PC starting at index
   *NUM_PCSP, and two PMD descriptors are filled in starting at
   PD[*NUM_PDSP]: the sampling counter, followed by PMD16 with a zero
   value and zero reset values.  *CS_PD is pointed at the counter's
   descriptor, and *NUM_PCSP / *NUM_PDSP are advanced past the new
   entries.  Calls panic() when the event cannot be dispatched.  */
static void
setup_ita2_btb_code_sampling (pfarg_pmc_t *pc, int *num_pcsp,
			 pfarg_pmd_t *pd, int *num_pdsp,
			 unsigned int plm, pfarg_pmd_t **cs_pd)
{
  const int pc_base = *num_pcsp;
  pfarg_pmd_t *const counter = pd + *num_pdsp;
  pfarg_pmd_t *const btb_index = counter + 1;
  pfmlib_ita2_input_param_t ita2_in;
  pfmlib_input_param_t in;
  pfmlib_output_param_t out;
  unsigned long period;
  unsigned long n;
  int status;

  memset (&in, 0, sizeof (in));
  memset (&ita2_in, 0, sizeof (ita2_in));
  memset (&out, 0, sizeof (out));

  /* Generic request: a single event; system-wide monitoring needs
     privileged monitors.  */
  in.pfp_dfl_plm = plm;
  in.pfp_event_count = 1;
  in.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;
  in.pfp_events[0].event = cs_event_code;

  /* BTB filter: all branch types, taken or not, whatever the target
     and path predictions were.  */
  ita2_in.pfp_ita2_btb.btb_used = 1;
  ita2_in.pfp_ita2_btb.btb_tm  = 0x3;
  ita2_in.pfp_ita2_btb.btb_ptm = 0x3;
  ita2_in.pfp_ita2_btb.btb_ppm = 0x3;
  ita2_in.pfp_ita2_btb.btb_brt = 0x0;
  ita2_in.pfp_ita2_btb.btb_plm = plm;

  status = pfm_dispatch_events (&in, &ita2_in, &out, NULL);
  if (status != PFMLIB_SUCCESS)
    panic ("pfm_dispatch_events(): %s\n", pfm_strerror (status));

  /* Append the PMC settings chosen by libpfm.  */
  for (n = 0; n < out.pfp_pmc_count; ++n)
    {
      pfarg_pmc_t *slot = &pc[pc_base + n];

      slot->reg_num = out.pfp_pmcs[n].reg_num;
      slot->reg_value = out.pfp_pmcs[n].reg_value;
    }

  /* The code-sampling counter takes its register number from the
     first PMC appended above.  */
  counter->reg_num = pc[pc_base].reg_num;
  *cs_pd = counter;

  /* On counter overflow: record the BTB registers, notify, pick a
     randomized next interval, and reset the BTB index (PMD16).  */
  counter->reg_smpl_pmds[0] = BTB_REGS_MASK;
  counter->reg_flags |= PFM_REGFL_OVFL_NOTIFY | PFM_REGFL_RANDOM;
  counter->reg_reset_pmds[0] = REG_MASK (16);

  /* Only the cycle event has a period derived from the requested
     sample rate; any other event gets a fixed guess.  */
  if (cs_event_code != cs_cycle_code)
    period = -100000;	/* your guess is as good as mine... */
  else
    period = - (long) (cycle_frequency / code_sample_rate);

  /* The same period serves as the initial value and as both the long
     and the short reset value.  */
  counter->reg_value = period;
  counter->reg_long_reset = period;
  counter->reg_short_reset = period;
  counter->reg_random_seed = 0xc0ffee;
  counter->reg_random_mask = 0x3ff;

  /* PMD16 starts at, and is reset to, zero.  */
  btb_index->reg_num = 16;
  btb_index->reg_value = 0;
  btb_index->reg_flags = 0;
  btb_index->reg_long_reset = 0;
  btb_index->reg_short_reset = 0;

  /* Publish the new descriptor counts.  */
  *num_pcsp += out.pfp_pmc_count;
  *num_pdsp += 2;
}

/* Debugging aid: print the BTB index and then the raw contents of
   the BTB data registers in PMD, newest first.

   PMD points at nine sampled registers (eight BTB data registers
   followed by PMD16).  The walk starts at the entry just before the
   BTB index and moves backwards, covering all eight entries when the
   buffer is marked full and bbi + 1 entries otherwise.  */
static inline void
dump_btb (pfm_ita2_pmd_reg_t *pmd)
{
  unsigned long reg, i, bbi, num_regs;
  pfm_ita2_pmd_reg_t pmd16;

  pmd16 = pmd[8];
  bbi = pmd16.pmd16_ita2_reg.btbi_bbi;

  if (pmd16.pmd16_ita2_reg.btbi_full)
    num_regs = 8;
  else
    num_regs = bbi + 1;
  i = (bbi + 7) % 8;
  printf ("--- bbi=%lu\n", bbi);
  for (reg = 0; reg < num_regs; ++reg)
    {
      /* Use the union's own raw-value member instead of reading the
	 register through a cast pointer.  */
      printf ("BTB[%lu] = %016lx\n", i, pmd[i].pmd_val);
      i = (i + 7) % 8;
    }
}

/* BTB support table for Itanium 2: ties the PMU type to the setup
   and sample-decoding routines defined in this file.  */
struct qsys_btb_support qsys_itanium2 =
  {
    .pmu_type = PFMLIB_ITANIUM2_PMU,
    .setup_btb_code_sampling = setup_ita2_btb_code_sampling,
    .setup_call_count_sampling = setup_ita2_call_count_sampling,
    .process_btb_sample = process_ita2_btb_sample,
    .get_ip_from_btb = get_ita2_ip_from_btb
  };
