/* This file is part of q-tools, a collection of performance tools
   Copyright (c) 2003-2004 Hewlett-Packard Development Company, L.P.
   Contributed by David Mosberger-Tang <davidm@hpl.hp.com>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA  02111-1307  USA  */

#include <elf.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <limits.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <unistd.h>

#ifdef HAVE_EXPLICIT_PERFMON3
# include <perfmon3/perfmon.h>
# include <perfmon3/perfmon_default_smpl.h>
# include <perfmon3/pfmlib.h>
# include <perfmon3/pfmlib_itanium2.h>
#else
# include <perfmon/perfmon.h>
# include <perfmon/perfmon_default_smpl.h>
# include <perfmon/pfmlib.h>
# include <perfmon/pfmlib_itanium2.h>
#endif

#include <sys/poll.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <q-lib.h>

#include "q-syscollect.h"
#include "call-counts.h"
#include "time-hist.h"

/* Number of elements in array A.  A must be a real array, not a
   pointer; the argument is fully parenthesized so that any array-valued
   expression may be passed.  */
#define NELEMS(a)		(sizeof (a) / sizeof ((a)[0]))
/* Mask with only bit REGNUM set.  */
#define REG_MASK(regnum)	(1UL << (regnum))

#define KB			(1 << 10)
#define SAMPLING_BUFFER_SIZE	(256*KB)
/* Mask selecting registers 8 through 16; used below as the set of PMDs
   recorded with each BRANCH_EVENT sample.  */
#define BTB_REGS_MASK		(  REG_MASK (8)  | REG_MASK (9)		\
				 | REG_MASK (10) | REG_MASK (11)	\
				 | REG_MASK (12) | REG_MASK (13)	\
				 | REG_MASK (14) | REG_MASK (15)	\
				 | REG_MASK (16))
#define BTB_LEN			4	/* McKinley-specific */

/* It is currently not possible to add events incrementally.  Because
   of that, we need a gross hack in setup_code_sampling() so libpfm
   knows that PMD4 is already taken for BRANCH_EVENT when we dispatch
   the code-sampling event (normally CPU_CYCLES).  */
#define GROSS_HACK	1

/* Bounds for the poll timeout, which process_pmu_samples() and
   adjust_rate() switch between.  */
#define MIN_POLL_TIMEOUT	 100	/* 100 milliseconds */
#define MAX_POLL_TIMEOUT	1000	/* 1 second */

/* Table of long command-line options (struct option from <getopt.h>).
   Each long name is paired with the single-character code stored in
   its last field; the all-zero entry terminates the table.  */
static struct option long_opts[] =
  {
    /* name                      has_arg            flag  val */
    { "help",                   no_argument,       NULL, 'h' },
    { "call-sample-rate",       required_argument, NULL, 'c' },
    { "code-sample-rate",       required_argument, NULL, 'C' },
    { "code-sample-event",      required_argument, NULL, 'e' },
    { "monitor-interruptions",  no_argument,       NULL, 'i' },
    { "monitor-kernel",         no_argument,       NULL, 'k' },
    { "merge-unknown",          optional_argument, NULL, 'm' },
    { "sample-duration",        required_argument, NULL, 't' },
    { "monitor-user",           no_argument,       NULL, 'u' },
    { "verbose",                no_argument,       NULL, 'v' },
    { NULL,                     0,                 NULL, 0   }
  };

/* PROG_NAME is the prefix used for diagnostics printed to stderr in
   this file; PROG_ARGV is the argument vector that write_profile()
   records in the info file as "q:syscollect-cmdline".  Neither is
   assigned in this part of the file -- presumably both are set up
   during program start-up.  */
const char *prog_name;
const char **prog_argv;

struct mapping
  {
    struct mapping *next;	/* must be first member in this structure! */
    unsigned long start;
    unsigned long size;
    unsigned long offset;
    const char *path;
  };

/* Per-address-space profiling state.  Instances are created by
   addr_space_create() and chained through NEXT on addr_space_list.  */
struct addr_space
  {
    struct addr_space *next;	/* next entry on addr_space_list */
    pid_t pid;			/* key used by get_addr_space() lookups */
    pid_t tgid;		/* thread-group id (defaults to same as pid) */
    const char *name;		/* base name of the output files */
    const char **argv;		/* command line of the process, or NULL */
    struct time_hist *hist;	/* histogram fed by code samples */
    struct call_count *cc;	/* call counts fed by BTB samples */
    struct mapping *mappings;	/* list of known memory mappings */
  };

/* Non-zero enables progress messages on stdout.  */
static int verbose;
static pid_t *child_pid;
/* Head of the list of all address spaces created so far.  */
static struct addr_space *addr_space_list;
static int monitor_interruptions;
static int monitor_user;
static int monitor_kernel;
static int merge_unknown_addr_spaces = 1;
/* Current poll timeout in milliseconds (MIN_POLL_TIMEOUT while the
   sampling rates are still being adjusted quickly).  */
static int poll_timeout = MIN_POLL_TIMEOUT;
static int num_cpus, my_cpu = -1;
/* PMD register number that's counting BRANCH_EVENT: */
static int branch_event_pmd = -1;
/* ditto for code-sampling: */
static int cs_event_pmd = -1;
static int duration;
/* Target number of samples per second for code and call sampling.  */
static double code_sample_rate = 1000.0;
static double call_sample_rate = 1000.0;
static const char *cs_event_name = "CPU_CYCLES";
/* PMD descriptors written back to the kernel whenever a sampling
   interval is adjusted, and the number of them in use.  */
static int num_pds;
static pfarg_reg_t pd[PFMLIB_MAX_PMDS];
/* Entries of pd[] describing the BRANCH_EVENT and code-sampling
   counters; NULL while the corresponding sampling is not set up.  */
static pfarg_reg_t *branch_event_pd, *cs_pd;
static struct timeval now, last_time;
/* Set by sigterm_handler().  It is written from a signal handler, so
   it must be volatile for the store to be reliably observed by the
   interrupted code.  */
static volatile sig_atomic_t done;
static unsigned long cycle_frequency;
/* Running sum of code-sampling intervals and number of code samples;
   used to report the average histogram granularity.  */
static unsigned long cs_interval_sum, cs_num_samples;
static pfm_default_smpl_hdr_t *sampling_buffer;
static int sampling_fd;

/* Find last bit set: return the zero-based index of the most
   significant 1 bit in VAL, or -1 when VAL is 0.  */
static inline long
fls (unsigned long val)
{
  long msb = -1;

  /* Every shift that still leaves bits behind moves the answer up
     by one position.  */
  for (; val != 0; val >>= 1)
    ++msb;

  return msb;
}

/* Fallback system-call number for builds whose headers do not define
   SYS_sched_setaffinity.  NOTE(review): 1231 is presumably the ia64
   number -- confirm before building for another architecture.  */
#ifndef SYS_sched_setaffinity
# define SYS_sched_setaffinity 1231
#endif

/* Some versions of libc declare sched_setaffinity() with two args,
   others with 3 args.  Circumvent the madness via syscall()...

   Passes PID, LEN and MASK straight to the sched_setaffinity system
   call and returns whatever syscall() returns.  */

int
my_sched_setaffinity (pid_t pid, size_t len, cpu_set_t *mask)
{
  return syscall (SYS_sched_setaffinity, pid, len, mask);
}

/* Allocate and initialize a new address space for PID/TGID and put it
   at the head of addr_space_list.

   NAME is copied; ARGV is stored as-is (it may be NULL).  Returns the
   new address space, or NULL if any allocation fails.  The object is
   linked into addr_space_list only once it is fully constructed, so a
   failure never leaves a freed entry on the list.  */
static struct addr_space *
addr_space_create (pid_t pid, pid_t tgid, const char *name, const char **argv)
{
  struct addr_space *as = calloc (1, sizeof (*as));

  if (!as)
    return NULL;

  as->pid = pid;
  as->tgid = tgid;
  as->argv = argv;

  as->name = strdup (name);
  if (!as->name)
    goto fail_free_as;

  as->hist = time_hist_create ();
  if (!as->hist)
    goto fail_free_name;

  as->cc = call_count_create ();
  if (!as->cc)
    goto fail_destroy_hist;

  /* Fully set up: now it is safe to publish it on the global list.  */
  as->next = addr_space_list;
  addr_space_list = as;
  return as;

 fail_destroy_hist:
  time_hist_destroy (as->hist);
 fail_free_name:
  free ((void *) as->name);
 fail_free_as:
  free (as);
  return NULL;
}

/* Release an argument vector built by get_addr_space(): every string
   plus the NULL-terminated array itself.  ARGV may be NULL.  */
static void
free_cmdline_argv (const char **argv)
{
  int i;

  if (!argv)
    return;

  for (i = 0; argv[i]; ++i)
    free ((void *) argv[i]);
  free ((void *) argv);
}

/* Return the address space that samples for PID should be charged to,
   creating it on first use.

   When user-level monitoring is off, everything goes to a single
   per-CPU kernel address space.  Otherwise the address space is
   looked up by PID; if none exists yet, the command line and the
   Name:/Tgid: fields are read from /proc/PID and a new address space
   is created.  Processes whose name cannot be determined are either
   merged into one shared "unknown" address space or get one of their
   own, depending on merge_unknown_addr_spaces.

   Returns NULL if the address space could not be allocated.  */
static struct addr_space *
get_addr_space (pid_t pid)
{
  static struct addr_space *kernel_addr_space;
  static struct addr_space *unknown_addr_space;
  static struct mapping entire_addr_space;
  char *line = NULL, *cp, name[PATH_MAX], exe_name[1024];
  const char **argv = NULL;
  struct addr_space *as;
  size_t line_size = 0;
  pid_t tgid = pid;
  FILE *fp;
  int i;

  if (!monitor_user)
    {
      if (!kernel_addr_space)
        {
          snprintf (name, sizeof (name), "kernel-cpu%d", my_cpu);
          kernel_addr_space = addr_space_create (0, 0, name, NULL);
        }
      return kernel_addr_space;
    }

  for (as = addr_space_list; as; as = as->next)
    if (as->pid == pid)
      return as;

  /* fetch the command-line: */
  if (pid > 0)
    {
      snprintf (name, sizeof (name), "/proc/%d/cmdline", pid);
      fp = fopen (name, "r");
      if (fp)
        {
          struct arg
            {
              struct arg *next;
              char *str;
            }
          *a, *a_end = NULL, *arg_list = NULL, *a_next;
          int argc = 0;

          /* The arguments are NUL-separated.  Stop at the first
             failure (EOF or read error) instead of testing feof(),
             which would loop forever on a read error.  */
          while (getdelim (&line, &line_size, '\0', fp) >= 0)
            {
              a = malloc (sizeof (*a));
              if (!a)
                break;
              a->str = strdup (line);
              if (!a->str)
                {
                  free (a);
                  break;
                }
              a->next = NULL;
              ++argc;
              if (a_end)
                a_end->next = a;
              else
                arg_list = a;
              a_end = a;
            }

          /* Move the strings into a NULL-terminated vector; if the
             vector cannot be allocated, drop the strings instead.  */
          argv = malloc ((argc + 1) * sizeof (argv[0]));
          for (i = 0, a = arg_list; a; ++i, a = a_next)
            {
              a_next = a->next;
              if (argv)
                argv[i] = a->str;
              else
                free (a->str);
              free (a);
            }
          if (argv)
            argv[i] = NULL;
          fclose (fp);
        }
    }

  /* fetch the process-name: */
  exe_name[0] = '\0';
  snprintf (name, sizeof (name), "/proc/%d/status", pid);
  fp = fopen (name, "r");
  if (fp)
    {
      int got_name = 0, got_tgid = 0, tgid_val;

      while (getline (&line, &line_size, fp) > 0)
        {
          /* The field width leaves room for the terminating NUL in
             exe_name[1024].  */
          if (sscanf (line, "Name: %1023s\n", exe_name) == 1)
            got_name = 1;
          else if (sscanf (line, "Tgid: %d\n", &tgid_val) == 1)
            {
              tgid = tgid_val;
              got_tgid = 1;
            }
          if (got_name && got_tgid)
            break;
        }
      fclose (fp);

      /* replace path-separators with underbars: */
      for (cp = exe_name; *cp; ++cp)
        if (*cp == '/')
          *cp = '_';
    }

  free (line);
  line = NULL;

  /* Note that we cheat here: we use the thread-group ID but call it
     the PID.  This is so that multiple threads will map to the same
     profile-name (apart from version number).  */
  if (exe_name[0])
    snprintf (name, sizeof (name), "%s-pid%d-cpu%d",
              exe_name, tgid, my_cpu);
  else
    {
      if (merge_unknown_addr_spaces)
        {
          /* The shared address space does not record a command line.  */
          free_cmdline_argv (argv);

          if (!unknown_addr_space)
            {
              entire_addr_space.next = NULL;
              entire_addr_space.start = 0;
              entire_addr_space.size = KERNEL_START;
              entire_addr_space.offset = 0;
              entire_addr_space.path = NULL;

              snprintf (name, sizeof (name), "unknown-cpu%d", my_cpu);
              unknown_addr_space = addr_space_create (0, 0, name, NULL);
              if (unknown_addr_space)
                unknown_addr_space->mappings = &entire_addr_space;
            }
          return unknown_addr_space;
        }
      else
        snprintf (name, sizeof (name), "unknown-pid%d-cpu%d", tgid, my_cpu);
    }

  as = addr_space_create (pid, tgid, name, argv);
  if (!as)
    /* Nobody took ownership of the command line.  */
    free_cmdline_argv (argv);
  return as;
}

/* Replace the mapping list of AS with a fresh one parsed from
   /proc/PID/maps.

   If the maps file cannot be opened (e.g. the process already
   exited), the existing list is left untouched.  Mappings whose
   permission field starts with "---" are skipped.  If memory runs out
   part-way through, the mappings read so far are kept.  */
static void
update_mappings (struct addr_space *as)
{
  struct mapping *m, *m_next, *list = NULL, *list_end = NULL;
  char buf[PATH_MAX], *line = NULL;
  size_t line_size = 0;
  FILE *fp;

  snprintf (buf, sizeof (buf), "/proc/%d/maps", as->pid);
  fp = fopen (buf, "r");
  if (!fp)
    /* process may be gone already */
    return;

  /* Destroy existing list of mappings.  */
  for (m = as->mappings; m; m = m_next)
    {
      m_next = m->next;
      free ((void *) m->path);
      free (m);
    }
  as->mappings = NULL;

  while (getline (&line, &line_size, fp) > 0)
    {
      unsigned long start, end, offset;
      /* Each buffer is one byte larger than the field width given to
         sscanf() so that the terminating NUL always fits.  */
      char path[4096], perm[5];

      path[0] = '\0';
      if (sscanf (line, "%lx-%lx %4s %lx %*x:%*x %*u %4095s\n",
                  &start, &end, perm, &offset, path) < 4)
        continue;

      if (strncmp (perm, "---", 3) == 0)
        /* ignore inaccessible mappings */
        continue;

#if 0
      printf ("%s: pid %d start=%lx size=%lu off=%lx path=%s\n",
              __FUNCTION__, as->pid, start, end - start, offset, path);
#endif
      m = malloc (sizeof (*m));
      if (!m)
        {
          fprintf (stderr, "%s: failed to allocate %zu bytes (%s)\n",
                   prog_name, sizeof (*m), strerror (errno));
          break;
        }

      m->next = NULL;
      m->start = start;
      m->size = end - start;
      m->offset = offset;
      m->path = strdup (path);	/* may be NULL; users check for that */
      if (list_end)
        list_end->next = m;
      else
        list = m;
      list_end = m;
    }
  free (line);
  fclose (fp);
  as->mappings = list;
}

static void
check_addr_space_mapping (struct addr_space *as, unsigned long addr)
{
  struct mapping *m, *prev = (struct mapping *) &as->mappings;

  if (addr - KERNEL_START < KERNEL_SIZE)
    /* kernel address --- no need to do anything */
    return;

  for (m = as->mappings; m; prev = m, m = m->next)
    if (addr - m->start < m->size)
      {
	if (m != as->mappings)
	  {
	    /* move to front of list */
	    prev->next = m->next;
	    m->next = as->mappings;
	    as->mappings = m;
	  }
	return;
      }
  /* Unknown mapping.  Update the list of mappings.  */
  update_mappings (as);
}

/* Return the bundle address recorded in BTB register REG, whose index
   within the BTB is REGNUM.  The address field is shifted left by 4;
   one extra bundle (0x10) is added when bit (4 + 4*REGNUM) of PMD16 is
   set.

   The slot number can be ignored here because functions must start
   and end at a bundle-boundary.  */
static inline uint64_t
get_btb_ip (int regnum, pfm_ita2_pmd_reg_t reg, pfm_ita2_pmd_reg_t pmd16)
{
  const unsigned long adjust_bit = 1UL << (4 + 4*regnum);
  uint64_t bundle_addr = (reg.pmd8_15_ita2_reg.btb_addr << 4);

  return (pmd16.pmd_val & adjust_bit) ? bundle_addr + 0x10 : bundle_addr;
}

/* Process one BRANCH_EVENT sample for address space AS.

   ENT is the sample's entry header and POS points at the nine 64-bit
   register values that follow it: eight BTB entries, then PMD16 (the
   BTB index register).  Every recorded branch is added to the
   call-count table of AS, weighted by the sampling interval divided
   by the number of branches found in the sample.

   Returns the position just past the nine registers.  */
static char *
process_btb_sample (struct addr_space *as,
		    pfm_default_smpl_entry_t *ent, char *pos)
{
  pfm_ita2_pmd_reg_t pmd16;
  uint64_t j, src_ip, dst_ip, last, interval;
  pfm_ita2_pmd_reg_t *reg, src, dst;
  unsigned long weight;
  int n;

  reg = (pfm_ita2_pmd_reg_t *) pos;

  /* The counter was reset to a negative value, so negating it yields
     the number of events in this sampling interval.  */
  interval = -ent->last_reset_val;

  pmd16 = reg[8];

  /* Walk the BTB as a ring of eight entries: start at the index in
     PMD16 when the buffer is marked full, otherwise at entry 0, and
     stop at the index in PMD16.  */
  last = pmd16.pmd16_ita2_reg.btbi_bbi;
  j = pmd16.pmd16_ita2_reg.btbi_full ? pmd16.pmd16_ita2_reg.btbi_bbi : 0;

#if 0
  printf ("first=%lu, last=%lu, pmd16=%lx, interval=%ld\n",
	  j, last, pmd16.pmd_val, interval);
  {
    int k;
    for (k = 0; k < 9; ++k)
      printf ("\treg[%d] = %016lx\n", k, reg[k].pmd_val);
  }
#endif

  /* count the number of branches recorded in the BTB: */
  n = 0;
  do
    {
      src = reg[j];
      dst = reg[(j + 1) & 7];

      if (src.pmd8_15_ds_ita2_reg.btb_b)
	{
	  ++n;
	  /* A following branch-target entry belongs to this branch;
	     step over it as well.  */
	  if (!dst.pmd8_15_ds_ita2_reg.btb_b)
	    j = (j + 1) & 7;
	  if (j == last)
	    break;
	}
      j = (j + 1) & 7;
    }
  while (j != last);

  /* Second pass: restart at the same first entry.  */
  j =  pmd16.pmd16_ita2_reg.btbi_full ? pmd16.pmd16_ita2_reg.btbi_bbi : 0;

  if (n > 0)
    {
#if 0
      n = 1;
#endif
      /* interval / n, rounded to the nearest integer */
      weight = (2*interval + n) / (2*n);

      while (n-- > 0)
	{
	  src = reg[j];
	  dst = reg[(j + 1) & 7];

	  /* Ensure "src" recorded a branch instruction (btb_b set to
	     1), not a branch target.  */
	  if (src.pmd8_15_ds_ita2_reg.btb_b)
	    {
	      src_ip = get_btb_ip (j, src, pmd16);
	      dst_ip = get_btb_ip ((j + 1) & 7, dst, pmd16);

	      /* Note that we're capturing br.ret, so a return from
		 SRC to DST gets count as a call from DST to
		 SRC...  */
	      call_count_add (as->cc, dst_ip, src_ip, weight);
	      check_addr_space_mapping (as, src_ip);
	      check_addr_space_mapping (as, dst_ip);

	      /* If destination address was recorded by a branch
		 target entry (btb_b set to 0), skip over that entry
		 as well.  */
	      if (!dst.pmd8_15_ds_ita2_reg.btb_b)
		j = (j + 1) & 7;
	    }
	  j = (j + 1) & 7;
	}
    }
  return (char *) &reg[9];	/* XXX must be 16-byte aligned??? */
}

/* Debugging aid: print the BTB contents found at PMD (eight BTB
   entries followed by PMD16) to stdout.  Entries are printed walking
   backwards from the one just before the index stored in PMD16; all
   eight are printed when the buffer is marked full, otherwise
   index + 1 of them.  */
inline static void
dump_btb (pfm_ita2_pmd_reg_t *pmd)
{
  unsigned long reg, i, bbi, num_regs;
  pfm_ita2_pmd_reg_t pmd16;

  pmd16 = pmd[8];
  bbi = pmd16.pmd16_ita2_reg.btbi_bbi;

  if (pmd16.pmd16_ita2_reg.btbi_full)
    num_regs = 8;
  else
    num_regs = bbi + 1;
  i = (bbi + 7) % 8;		/* i.e., bbi - 1 modulo 8 */
  printf ("--- bbi=%lu\n", bbi);
  for (reg = 0; reg < num_regs; ++reg)
    {
      /* Read the raw register value through the union's pmd_val
         member rather than by casting the pointer.  */
      printf ("BTB[%lu] = %016lx\n", i, pmd[i].pmd_val);
      i = (i + 7) % 8;
    }
}

/* Determine the instruction pointer to charge a code sample to, using
   the BTB contents that follow sample entry ENT.

   *POSP points at nine register values (eight BTB entries followed by
   PMD16) and is advanced past them.  The result is derived from the
   most recently recorded branch.  If no branch was recorded, or the
   most recent one looks like an "rfi" marker, ENT->ip is returned
   instead with a marker (0xc or 0xd respectively) or'ed into its low
   bits.  */
static uint64_t
get_ip_from_btb (pfm_default_smpl_entry_t *ent, char **posp)
{
  unsigned long reg, num_regs, i, j, bbi, src_addr, dst_addr;
  pfm_ita2_pmd_reg_t *pmd, src, dst, pmd16;

  pmd = (pfm_ita2_pmd_reg_t *) *posp;
  *posp = (char *) &pmd[9];

  /* find the most recently recorded branch in the BTB: */
  pmd16 = pmd[8];
  bbi = pmd16.pmd16_ita2_reg.btbi_bbi;

  if (pmd16.pmd16_ita2_reg.btbi_full)
    num_regs = 8;
  else
    num_regs = bbi + 1;
  /* Start with the entry just before the index in PMD16 and walk
     backwards through the ring.  */
  i = (bbi + 7) % 8;
  for (reg = 0; reg < num_regs; ++reg)
    {
      src = pmd[i];
      if (src.pmd8_15_ita2_reg.btb_b)
	{
	  /* Found the register recording the most-recent branch
	     instruction.  */

	  /* As a special case, if the most recent branch was a taken
	     branch from the last slot in a bundle to the next bundle,
	     we treat it as an "rfi" marker and ignore it.  */
	  if (src.pmd8_15_ita2_reg.btb_slot == 2 && reg > 0)
	    {
	      j = (i + 1) & 7;
	      dst = pmd[j];
	      src_addr = get_btb_ip (i, src, pmd16);
	      dst_addr = get_btb_ip (j, dst, pmd16);
	      if (dst_addr - src_addr == 0x10)
		return ent->ip | 0xd;
	    }

	  /* NOTE(review): a slot value of 3 presumably marks a branch
	     that was not taken -- confirm against the PMU manual.
	     "reg > 0" means at least one newer entry follows this
	     one.  */
	  if (src.pmd8_15_ita2_reg.btb_slot != 3 && reg > 0)
	    {
	      /* the branch was taken and we have a record of the
		 branch destination; record that instead */
	      i = (i + 1) & 7;
	      src = pmd[i];
	    }
	  break;
	}
      i = (i + 7) % 8;
    }
  if (reg >= num_regs)
    {
      /* BTB didn't record any branches!  Use IP from interrupt
	 instead and mark bits 0-3 with "0xc" so we can distinguish
	 the sample from BTB-samples. */
      return ent->ip | 0xc;	/* or in with special marker... */
    }
  return get_btb_ip (i, src, pmd16);
}

/* Process one code sample for address space AS.

   The sampling interval of ENT is added to the histogram of AS, at
   the address taken from the BTB when interruptions are monitored and
   at ENT->ip otherwise.  The interval also goes into the global
   running totals.  Returns the position of the next sample; POS only
   moves when the BTB registers were consumed.  */
static char *
process_code_sample (struct addr_space *as,
                     pfm_default_smpl_entry_t *ent, char *pos)
{
  unsigned long period = -ent->last_reset_val;
  uint64_t sample_ip = ent->ip;

  ++cs_num_samples;
  cs_interval_sum += period;

  if (monitor_interruptions)
    sample_ip = get_ip_from_btb (ent, &pos);

  time_hist_add (as->hist, sample_ip, period);
  /* The mapping check always uses the interrupted IP, not the
     BTB-derived one.  */
  check_addr_space_mapping (as, ent->ip);
  return pos;
}

/* Adjust the sampling interval described by CDESC so that the
   observed CURRENT_RATE (samples per second) moves towards GOAL_RATE.

   Nothing is changed while the current rate is within a factor of two
   of the goal.  Otherwise the interval is halved or doubled, the
   randomization mask is set to roughly a quarter of the new interval,
   and the new values are stored in CDESC.  When the rate is off by
   more than a factor of eight, the poll timeout is also dropped to
   MIN_POLL_TIMEOUT so that further corrections happen quickly.

   Returns 1 if CDESC was modified (the caller must then write the
   PMDs back), 0 otherwise.  */
static int
adjust_rate (pfarg_reg_t *cdesc, double current_rate, double goal_rate)
{
  unsigned long counter, range_mask;

  if (0.5*goal_rate <= current_rate && current_rate <= 2.0*goal_rate)
    /* No need to adjust rate.  */
    return 0;

  /* If the rate is grossly wrong, switch into fast adjustment mode.  */
  if (current_rate < goal_rate/8.0 || current_rate > 8.0*goal_rate)
    poll_timeout = MIN_POLL_TIMEOUT;

  /* Reset values are stored negated; recover the interval length.  */
  counter = -cdesc->reg_short_reset;

  /* NOTE(review): because of the early return above, the 0.9x and
     1.1x fine-adjustment branches below cannot currently be reached;
     they only come into play if the tolerance band is narrowed.  */
  if (current_rate < 0.5*goal_rate)
    {
      counter /= 2;
      if (!counter)
        counter = 1;
    }
  else if (current_rate < 0.9*goal_rate)
    {
      counter *= 0.9;
      if (!counter)
        counter = 1;
    }
  else if (current_rate > 2*goal_rate)
    {
      if (2*counter <= counter)
        /* on overflow, max out at ~0UL: */
        counter = ~0UL;
      else
        counter *= 2;
    }
  else if (current_rate > 1.1*goal_rate)
    {
      /* The product is computed in floating point and therefore never
         wraps; it has to be range-checked before converting it back
         to an integer.  */
      double scaled = 1.1 * counter;

      if (scaled >= (double) ~0UL)
        /* on overflow, max out at ~0UL: */
        counter = ~0UL;
      else
        counter = (unsigned long) scaled;
    }

  if (counter / 4)
    range_mask = (1UL << fls (counter / 4)) - 1;
  else
    range_mask = 0;

#if 0
  printf ("new counter = %lu +/- %lu\n", counter + range_mask/2, range_mask/2);
#endif

  /*
   * XXX eventually, we may want to be smarter about picking the
   * long_reset/initial values to let the machine quiet down before
   * resuming sampling.  But it's not clear how to make a good choice
   * for arbitrary counters.
   */
  cdesc->reg_value = -counter;
  cdesc->reg_long_reset = -counter;
  cdesc->reg_short_reset = -counter;
#if 0
  cdesc->reg_random_seed = last_reset_value;
#endif
  cdesc->reg_random_mask = range_mask;
  return 1;
}

/* Drain the kernel sampling buffer.

   Every entry is handed to process_btb_sample() or
   process_code_sample(), depending on which PMD overflowed.
   Afterwards the observed sample rates are compared with the goal
   rates and, if any interval had to be adjusted, the PMDs are written
   back to the kernel.  Panics on an unsupported buffer format, an
   unexpected overflow register, or when no address space can be
   allocated for a sample.  */
static void
process_pmu_samples (void)
{
  pfm_default_smpl_hdr_t *hdr = sampling_buffer;
  pfm_default_smpl_entry_t *ent;
  unsigned long i, num_btb_samples = 0, num_code_samples = 0;
  struct addr_space *as = NULL;
  int need_adjustment;
  pid_t pid = -1;
  double delta;
  char *pos;

  last_time = now;
  gettimeofday (&now, NULL);

#if 0
  printf ("%s(count=%lu)\n", __FUNCTION__, hdr->hdr_count);
#endif

  if (hdr->hdr_version != PFM_DEFAULT_SMPL_VERSION)
    panic ("pfmlib v%u.%u sampling format is not supported\n",
           PFM_VERSION_MAJOR (hdr->hdr_version),
           PFM_VERSION_MINOR (hdr->hdr_version));

  /* The entries start right behind the buffer header.  */
  pos = (char *) (hdr + 1);

  for (i = 0; i < hdr->hdr_count; ++i)
    {
      ent = (pfm_default_smpl_entry_t *) pos;
      pos = (char *) (ent + 1);

      /* Consecutive samples often belong to the same process; only
         look up the address space again when the pid changes.  */
      if (pid != ent->pid)
        {
          pid = ent->pid;
          as = get_addr_space (pid);
        }

      /* get_addr_space() returns NULL when it runs out of memory; the
         sample processors below dereference AS unconditionally.  */
      if (!as)
        panic ("failed to allocate address space for pid %d\n", (int) pid);

      if (ent->ovfl_pmd == branch_event_pmd)
        {
          ++num_btb_samples;
          pos = process_btb_sample (as, ent, pos);
        }
      else if (ent->ovfl_pmd == cs_event_pmd)
        {
          ++num_code_samples;
          pos = process_code_sample (as, ent, pos);
        }
      else
        panic ("Unknown PMD register %u overflowed\n", ent->ovfl_pmd);
    }

  /* seconds elapsed since the previous call */
  delta = ((now.tv_sec + 1e-6 * now.tv_usec)
           - (last_time.tv_sec + 1e-6 * last_time.tv_usec));

  if (verbose)
    printf ("[%d] collected %lu BTB samples, %lu code samples in %g seconds\n",
            getpid (), num_btb_samples, num_code_samples, delta);

  /* Assume the rates are fine; adjust_rate() lowers the timeout again
     if one of them is grossly off.  */
  poll_timeout = MAX_POLL_TIMEOUT;
  need_adjustment = 0;
  if (branch_event_pd)
    need_adjustment |= adjust_rate (branch_event_pd, num_btb_samples / delta,
                                    call_sample_rate);
  if (cs_pd)
    need_adjustment |= adjust_rate (cs_pd, num_code_samples / delta,
                                    code_sample_rate);
  if (need_adjustment
      && perfmonctl (sampling_fd, PFM_WRITE_PMDS, pd, num_pds) < 0)
    panic ("perfmonctl(PFM_WRITE_PMDS) failed (%s)\n", strerror (errno));
}

/* Signal handler: request an orderly shutdown by setting DONE.
   SIGNUM is not used.

   NOTE(review): printf() is not async-signal-safe; the verbose
   message could misbehave if the signal interrupts another stdio
   call.  */
static void
sigterm_handler (int signum)
{
  if (verbose)
    printf ("[%d] termination-request received; shutting down\n", getpid ());
  done = 1;
}

/* Callback passed to time_hist_extract(): write the LEN histogram
   slots in COUNTS, the first of which corresponds to address ADDR, to
   the stdio stream given as ARG.  One "address count" line is written
   per non-zero slot; consecutive slots are (1 << INSN_SHIFT) bytes
   apart.  Always returns 0.  */
static int
time_hist_write (void *arg, uintptr_t addr, size_t len, unsigned long *counts)
{
  FILE *out = arg;
  size_t slot = 0;

  while (slot < len)
    {
      unsigned long hits = counts[slot];

      /* empty slots are not written at all */
      if (hits != 0)
        fprintf (out, "0x%lx %lu\n", addr + (slot << INSN_SHIFT), hits);
      ++slot;
    }
  return 0;
}

/* Callback passed to call_count_extract(): write one call-graph edge
   to the stdio stream given as ARG, as a line of the form
   "0xFROM 0xTO COUNT".  Always returns 0.  */
static int
call_count_write (void *arg, uintptr_t from, uintptr_t to, size_t count)
{
  FILE *fp = arg;

  /* Use the conversions that match uintptr_t and size_t exactly
     instead of assuming both are unsigned long.  */
  fprintf (fp, "0x%" PRIxPTR " 0x%" PRIxPTR " %zu\n", from, to, count);
  return 0;
}

/* Return non-zero if PATH can be opened and starts with a valid ELF
   header prefix: the ELF magic number followed by a 32-bit or 64-bit
   class byte.  Returns 0 for unreadable, too-short, or non-ELF
   files.  */
static int
elf_valid_file (const char *path)
{
  char buf[EI_CLASS + 1];	/* magic plus the class byte */
  ssize_t nread;
  int fd = open (path, O_RDONLY);

  if (fd < 0)
    return 0;

  /* Close the descriptor before looking at the result so that it is
     released on the short-read path as well.  */
  nread = read (fd, buf, sizeof (buf));
  close (fd);

  if (nread != (ssize_t) sizeof (buf))
    return 0;

  return memcmp (buf, ELFMAG, SELFMAG) == 0
    && (buf[EI_CLASS] == ELFCLASS32 || buf[EI_CLASS] == ELFCLASS64);
}

/* Write the NULL-terminated argument vector ARGV to FP as the
   expression

	(KEY . ("arg0" "arg1" ...))

   Every argument is enclosed in double quotes, with embedded double
   quotes and backslashes escaped by a backslash.  */
static void
write_argv (FILE *fp, const char *key, const char **argv)
{
  const char **argp, *s;

  fprintf (fp, "(%s . (", key);

  for (argp = argv; *argp; ++argp)
    {
      /* a single blank separates the arguments */
      if (argp != argv)
        fputc (' ', fp);

      fputc ('"', fp);
      for (s = *argp; *s != '\0'; ++s)
        {
          if (*s == '"' || *s == '\\')
            fputc ('\\', fp);
          fputc (*s, fp);
        }
      fputc ('"', fp);
    }
  fprintf (fp, "))");
}

/* Write the profile collected for address space AS.

   Up to three files are created, all named after AS->name: an ".info"
   file describing the profile, a ".hist" file with the histogram and,
   when call-count sampling is active, an ".edge" file with the call
   counts.  The info file also lists every ELF object mapped into the
   address space, together with all of its mappings.  KALLSYMS, if
   non-NULL, is recorded in the info file.

   As a side effect, the path of every mapping that has been written
   is freed and set to NULL.  All files opened here are closed again
   on every path, including the error paths.  */
static void
write_profile (struct addr_space *as, const char *kallsyms)
{
  FILE *info, *hist = NULL, *edge = NULL;
  struct mapping *m, *n;
  char buf[PATH_MAX];
  const char *cp;

  snprintf (buf, sizeof (buf), "%s.info", as->name);
  info = q_create_file (buf, sizeof (buf));
  if (!info)
    {
      fprintf (stderr, "%s: couldn't open info file %s\n", prog_name, buf);
      return;
    }

  /* dump the command-line(s) to the info file: */
  fprintf (info, "(q:info '(");
  write_argv (info, "q:syscollect-cmdline", prog_argv);
  if (as->argv)
    {
      fprintf (info, "\n          ");
      write_argv (info, "q:cmdline", as->argv);
    }
  fprintf (info, "))\n");

  snprintf (buf, sizeof (buf), "%s.hist", as->name);
  hist = q_create_file (buf, sizeof (buf));
  if (!hist)
    {
      fprintf (stderr, "%s: couldn't open histogram file\n", prog_name);
      goto out;
    }

  /* BUF still holds the histogram file name here; it is not reused
     before this fprintf() is done, so no copy is needed.  */
  fprintf (info,
           "(q:histogram '((q:file . \"%s\")\n"
           "               (q:event-name . \"%s\")\n"
           "               (q:x-unit-label . \"address\")\n",
           buf, cs_event_name);
  if (strcmp (cs_event_name, "CPU_CYCLES") == 0)
    fprintf (info,
             "               (q:y-unit-label . \"seconds\")\n"
             "               (q:y-unit-conversion-factor . %g)\n",
             1 / (double) cycle_frequency);
  fprintf (info,
           "               (q:y-granularity . %g)))\n",
           cs_interval_sum / (double) cs_num_samples / cycle_frequency);

  if (branch_event_pd)
    {
      snprintf (buf, sizeof (buf), "%s.edge", as->name);
      edge = q_create_file (buf, sizeof (buf));
      if (!edge)
        {
          fprintf (stderr, "%s: couldn't open edge-count file\n", prog_name);
          goto out;
        }
      fprintf (info, "(q:call-counts '((q:file . \"%s\")))\n", buf);
      call_count_extract (as->cc, call_count_write, edge);
    }

  time_hist_extract (as->hist, time_hist_write, hist);

  if (kallsyms)
    fprintf (info, "(q:kallsyms \"%s\")\n", kallsyms);

  for (m = as->mappings; m; m = m->next)
    {
      struct stat statbuf;
      int first = 1;

      /* Mappings without a path are anonymous or were already written
         as part of an earlier object.  */
      if (!m->path || !m->path[0])
        continue;

      if (stat (m->path, &statbuf) < 0 || !S_ISREG (statbuf.st_mode)
          || !elf_valid_file (m->path))
        /* skip mappings for special files and non-ELF files... */
        continue;

      /* the object's name is the last component of its path */
      cp = strrchr (m->path, '/');
      if (cp)
        ++cp;
      else
        cp = m->path;

      if (q_checksummed_link (Q_LINK_ANY, buf, sizeof (buf), cp, m->path)
          != 0)
        continue;

      fprintf (info,
               "(q:object '((q:name . \"%s\")\n"
               "            (q:file . \"%s\")\n"
               "            (q:maps . (", cp, buf);
      /* List every mapping of the same file under this object.  */
      for (n = as->mappings; n; n = n->next)
        {
          if (!n->path)
            continue;
          if (strcmp (m->path, n->path) != 0)
            continue;

          if (!first)
            fprintf (info, "\n                        ");
          first = 0;

          fprintf (info, "((q:addr . #x%lx) (q:size . %lu) "
                   "(q:offset . #x%lx))", n->start, n->size, n->offset);
          if (m != n)
            {
              free ((void *) n->path);
              n->path = NULL;	/* mark this mapping as written */
            }
        }
      /* CP may point into M->path, so it is freed only after the last
         use of CP above.  */
      free ((void *) m->path);
      m->path = NULL;
      fprintf (info, "))))\n");
    }

 out:
  q_close (info);
  if (hist)
    q_close (hist);
  if (edge)
    q_close (edge);
}

/* Set up call-count sampling: program the Itanium 2 BTB to capture
   taken return-branches and make BRANCH_EVENT overflows record the
   BTB registers in the sampling buffer.

   PC/PD are the PMC/PMD descriptor arrays being built up and
   *NUM_PCSP/*NUM_PDSP the number of entries already in use; both
   counts are advanced by the number of entries added here (two PMDs:
   the BRANCH_EVENT counter and PMD16).  PRIVILEGE_LEVEL_MASK selects
   the privilege levels to monitor.

   On a PMU other than Itanium 2, a warning is printed and nothing is
   set up.  Other failures are fatal (panic).  */
static void
setup_call_count_sampling (pfarg_reg_t *pc, int *num_pcsp,
                           pfarg_reg_t *pd, int *num_pdsp,
                           unsigned int privilege_level_mask)
{
  int j, ret, type, num_pcs = *num_pcsp, num_pds = *num_pdsp;
  pfmlib_ita2_input_param_t i2param;
  pfmlib_input_param_t iparam;
  pfmlib_output_param_t oparam;
  unsigned long reset_value;
  unsigned long i;

  pfm_get_pmu_type (&type);
  if (type != PFMLIB_ITANIUM2_PMU)
    {
      char model[80];
      pfm_get_pmu_name (model, sizeof (model));
      fprintf (stderr, "Call-count sampling not supported by %s PMU; "
               "disabling it.\n", model);
      return;		/* not a fatal error... */
    }

  /* Clear the whole structures (sizeof the object, not of a pointer
     to it).  */
  memset (&iparam, 0, sizeof (iparam));
  memset (&i2param, 0, sizeof (i2param));

  i2param.pfp_ita2_btb.btb_used = 1;
  i2param.pfp_ita2_btb.btb_tm  = 0x2;	/* capture taken-branches only */
  i2param.pfp_ita2_btb.btb_ptm = 0x3;	/* capture regardless of tgt pred. */
  i2param.pfp_ita2_btb.btb_ppm = 0x3;	/* capture regardless of path pred. */
  i2param.pfp_ita2_btb.btb_brt = 0x2;	/* capture only return branches */
  i2param.pfp_ita2_btb.btb_plm = privilege_level_mask;

  iparam.pfp_dfl_plm = privilege_level_mask;
  iparam.pfp_event_count = 1;
  /* for system-wide monitoring we must use privileged monitors: */
  iparam.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;
  if (pfm_find_event_byname ("BRANCH_EVENT", &iparam.pfp_events[0].event)
      != PFMLIB_SUCCESS)
    panic ("pfm_find_event_byname: failed to find BRANCH_EVENT\n");

  memset (&oparam, 0, sizeof (oparam));
  ret = pfm_dispatch_events (&iparam, &i2param, &oparam, NULL);
  if (ret != PFMLIB_SUCCESS)
    panic ("pfm_dispatch_events(): %s\n", pfm_strerror (ret));

  /* Now setup the PMC and PMD descriptors: */

  for (i = 0; i < oparam.pfp_pmc_count; ++i)
    {
      /* refuse to program a PMC that an earlier setup step claimed */
      for (j = 0; j < num_pcs; ++j)
        if (oparam.pfp_pmcs[i].reg_num == pc[j].reg_num)
          panic ("%s: PMC%d is already busy!\n", __FUNCTION__, pc[j].reg_num);
      pc[num_pcs + i].reg_num = oparam.pfp_pmcs[i].reg_num;
      pc[num_pcs + i].reg_value = oparam.pfp_pmcs[i].reg_value;
    }

  /* set the PMD reg # for BRANCH_EVENT */
  pd[num_pds].reg_num = pc[num_pcs].reg_num;
  branch_event_pd = pd + num_pds;
  branch_event_pmd = pd[num_pds].reg_num;

  /* Specify what happens when the BRANCH_EVENT counter wraps-around: */

  pc[num_pcs].reg_smpl_pmds[0] = BTB_REGS_MASK;
  pc[num_pcs].reg_flags |= PFM_REGFL_OVFL_NOTIFY; /* notify on BRANCH_EVENT */
  pc[num_pcs].reg_flags |= PFM_REGFL_RANDOM;	/* randomize the interval */
  /* clear BTB index (PMD16):  */
  pc[num_pcs].reg_reset_pmds[0] = REG_MASK (16);

  /* The counter starts at a negative value and overflows after that
     many events.  */
  reset_value = -100000;
  pd[num_pds].reg_value = reset_value;		/* initial value */
  pd[num_pds].reg_long_reset = reset_value;	/* min-long-interval */
  pd[num_pds].reg_short_reset = reset_value;	/* min-short-interval */
  pd[num_pds].reg_random_seed = 0xc0ffee;	/* seed */
  pd[num_pds].reg_random_mask = 0x3ff;		/* mask */

  /* Define the reset value for PMD16: */
  pd[num_pds + 1].reg_num = 16;
  pd[num_pds + 1].reg_value = 0;
  pd[num_pds + 1].reg_long_reset = 0;
  pd[num_pds + 1].reg_short_reset = 0;

  /* Commit the new pc/pd structures: */
  *num_pcsp += oparam.pfp_pmc_count;
  *num_pdsp += 2;
}

/* Set up code sampling: count the event named by cs_event_name and
   take a sample each time its counter overflows, without recording
   any additional PMDs.

   PC/PD are the PMC/PMD descriptor arrays being built up and
   *NUM_PCSP/*NUM_PDSP the number of entries already in use; both
   counts are advanced by the number of entries added here (one PMD).
   PRIVILEGE_LEVEL_MASK selects the privilege levels to monitor.
   Failures are fatal (panic).  */
static void
setup_code_sampling (pfarg_reg_t *pc, int *num_pcsp,
                     pfarg_reg_t *pd, int *num_pdsp,
                     unsigned int privilege_level_mask)
{
  int j, ret, num_pcs = *num_pcsp, num_pds = *num_pdsp;
  pfmlib_input_param_t iparam;
  pfmlib_output_param_t oparam;
  unsigned long reset_value;
  unsigned long i;

  /* Clear the whole structure (sizeof the object, not of a pointer to
     it).  */
  memset (&iparam, 0, sizeof (iparam));

  iparam.pfp_dfl_plm = privilege_level_mask;
  iparam.pfp_event_count = 1;
  /* for system-wide monitoring we must use privileged monitors: */
  iparam.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;
#if GROSS_HACK
  /* Dispatch BRANCH_EVENT along with the real event so that libpfm
     does not hand out the counter BRANCH_EVENT already occupies; its
     output entry is discarded again below.  */
  ++iparam.pfp_event_count;
  if (pfm_find_event_byname ("BRANCH_EVENT", &iparam.pfp_events[0].event)
      != PFMLIB_SUCCESS)
    panic ("pfm_find_event_byname: failed to find BRANCH_EVENT\n");
  if (pfm_find_event_byname (cs_event_name, &iparam.pfp_events[1].event)
      != PFMLIB_SUCCESS)
    panic ("pfm_find_event_byname: failed to find %s\n", cs_event_name);
#else
  if (pfm_find_event_byname (cs_event_name, &iparam.pfp_events[0].event)
      != PFMLIB_SUCCESS)
    panic ("pfm_find_event_byname: failed to find %s\n", cs_event_name);
#endif

  memset (&oparam, 0, sizeof (oparam));
  ret = pfm_dispatch_events (&iparam, NULL, &oparam, NULL);
  if (ret != PFMLIB_SUCCESS)
    panic ("pfm_dispatch_events(): %s\n", pfm_strerror (ret));

#if GROSS_HACK
  /* keep only the entry that belongs to the code-sampling event */
  oparam.pfp_pmc_count = 1;
  oparam.pfp_pmcs[0] = oparam.pfp_pmcs[1];
#endif

  /* Now setup the PMC and PMD descriptors: */

  for (i = 0; i < oparam.pfp_pmc_count; ++i)
    {
      /* refuse to program a PMC that an earlier setup step claimed */
      for (j = 0; j < num_pcs; ++j)
        {
          if (oparam.pfp_pmcs[i].reg_num == pc[j].reg_num)
            panic ("%s: PMC%d is already busy!\n",
                   __FUNCTION__, pc[j].reg_num);
        }
      pc[num_pcs + i].reg_num = oparam.pfp_pmcs[i].reg_num;
      pc[num_pcs + i].reg_value = oparam.pfp_pmcs[i].reg_value;
    }

  /* set the PMD reg # for the code-sampling event */
  pd[num_pds].reg_num = pc[num_pcs].reg_num;
  cs_pd = pd + num_pds;
  cs_event_pmd = pd[num_pds].reg_num;

  /* Specify what happens when the code sampling event-counter wraps-around: */

  pc[num_pcs].reg_smpl_pmds[0] = 0;		/* don't sample any PMDs */
  pc[num_pcs].reg_flags |= PFM_REGFL_OVFL_NOTIFY; /* notify on overflow */
  pc[num_pcs].reg_flags |= PFM_REGFL_RANDOM;	/* randomize the interval */

  /* For cycle counting the interval follows directly from the clock
     frequency and the requested sample rate.  */
  if (strcmp (cs_event_name, "CPU_CYCLES") == 0)
    reset_value = - (long) (cycle_frequency / code_sample_rate);
  else
    reset_value = -100000;	/* your guess is as good as mine... */

  pd[num_pds].reg_value = reset_value;		/* initial value */
  pd[num_pds].reg_long_reset = reset_value;	/* min-long-interval */
  pd[num_pds].reg_short_reset = reset_value;	/* min-short-interval */
  pd[num_pds].reg_random_seed = 0xf00d;		/* seed */
  pd[num_pds].reg_random_mask = 0x3ff;		/* mask */

  /* Commit the new pc/pd structures: */
  *num_pcsp += oparam.pfp_pmc_count;
  *num_pdsp += 1;
}

/* Configure code sampling that also records the Itanium 2 branch
   trace buffer (BTB), used when interruption monitoring is requested.

   PC/NUM_PCSP: PMC descriptor array and its current entry count; the
     PMC entries produced by pfm_dispatch_events() are appended and
     the count is advanced by that number.
   PD/NUM_PDSP: PMD descriptor array and its current entry count; two
     entries are appended (the sampling counter and PMD16).
   PRIVILEGE_LEVEL_MASK: privilege levels to monitor, used both as the
     default event mask and as the BTB mask.

   Side effects: sets the file-scope cs_pd and cs_event_pmd to the
   newly added sampling PMD.  If the PMU is not an Itanium 2, a
   message is printed and the function returns without modifying any
   of its arguments.  Any libpfm failure or PMC conflict is fatal
   (panic).  */
static void
setup_btb_code_sampling (pfarg_reg_t *pc, int *num_pcsp,
			 pfarg_reg_t *pd, int *num_pdsp,
			 unsigned int privilege_level_mask)
{
  int j, ret, type, num_pcs = *num_pcsp, num_pds = *num_pdsp;
  pfmlib_ita2_input_param_t i2param;
  pfmlib_input_param_t iparam;
  pfmlib_output_param_t oparam;
  unsigned long reset_value;
  unsigned long i;

  pfm_get_pmu_type (&type);
  if (type != PFMLIB_ITANIUM2_PMU)
    {
      char model[80];
      pfm_get_pmu_name (model, sizeof (model));
      fprintf (stderr, "Interruption monitoring not supported by %s PMU; "
	       "disabling it.\n", model);
      return;		/* not a fatal error... */
    }

  /* Clear the whole structure; sizeof (&iparam) would only cover a
     pointer's worth of bytes and leave the rest uninitialized.  */
  memset (&iparam, 0, sizeof (iparam));
  memset (&i2param, 0, sizeof (i2param));

  i2param.pfp_ita2_btb.btb_used = 1;
  i2param.pfp_ita2_btb.btb_tm  = 0x3;	/* capture regardless whether taken */
  i2param.pfp_ita2_btb.btb_ptm = 0x3;	/* capture regardless of tgt pred. */
  i2param.pfp_ita2_btb.btb_ppm = 0x3;	/* capture regardless of path pred. */
  i2param.pfp_ita2_btb.btb_brt = 0x0;	/* capture all branch-types */
  i2param.pfp_ita2_btb.btb_plm = privilege_level_mask;

  iparam.pfp_dfl_plm = privilege_level_mask;
  iparam.pfp_event_count = 1;
  /* for system-wide monitoring we must use privileged monitors: */
  iparam.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;
  if (pfm_find_event_byname (cs_event_name, &iparam.pfp_events[0].event)
      != PFMLIB_SUCCESS)
    panic ("pfm_find_event_byname: failed to find %s\n", cs_event_name);

  memset (&oparam, 0, sizeof (oparam));
  ret = pfm_dispatch_events (&iparam, &i2param, &oparam, NULL);
  if (ret != PFMLIB_SUCCESS)
    panic ("pfm_dispatch_events(): %s\n", pfm_strerror (ret));

  /* Now setup the PMC and PMD descriptors.  Each PMC chosen by the
     library is appended after the entries already in PC; a register
     number that is already present there is a fatal conflict.  */

  for (i = 0; i < oparam.pfp_pmc_count; ++i)
    {
      for (j = 0; j < num_pcs; ++j)
	if (oparam.pfp_pmcs[i].reg_num == pc[j].reg_num)
	  panic ("%s: PMC%d is already busy!\n", __FUNCTION__, pc[j].reg_num);
      pc[num_pcs + i].reg_num = oparam.pfp_pmcs[i].reg_num;
      pc[num_pcs + i].reg_value = oparam.pfp_pmcs[i].reg_value;
    }

  /* set the PMD reg # for code-sampling event (same number as the
     first PMC that was just added): */
  pd[num_pds].reg_num = pc[num_pcs].reg_num;
  cs_pd = pd + num_pds;
  cs_event_pmd = pd[num_pds].reg_num;

  /* Specify what happens when the code sampling event-counter wraps-around: */

  pc[num_pcs].reg_smpl_pmds[0] = BTB_REGS_MASK;	/* record the BTB PMDs */
  pc[num_pcs].reg_flags |= PFM_REGFL_OVFL_NOTIFY; /* notify on overflow */
  pc[num_pcs].reg_flags |= PFM_REGFL_RANDOM;	/* randomize the interval */
  /* clear BTB index (PMD16):  */
  pc[num_pcs].reg_reset_pmds[0] = REG_MASK (16);

  /* The counter counts up from a negative value and overflows at
     zero, so the reset value is minus the desired sampling period.  */
  if (strcmp (cs_event_name, "CPU_CYCLES") == 0)
    reset_value = - (long) (cycle_frequency / code_sample_rate);
  else
    reset_value = -100000;	/* your guess is as good as mine... */

  pd[num_pds].reg_value = reset_value;		/* initial value */
  pd[num_pds].reg_long_reset = reset_value;	/* min-long-interval */
  pd[num_pds].reg_short_reset = reset_value;	/* min-short-interval */
  pd[num_pds].reg_random_seed = 0xc0ffee;	/* seed */
  pd[num_pds].reg_random_mask = 0x3ff;		/* mask */

  /* Define the reset value for PMD16: */
  pd[num_pds + 1].reg_num = 16;
  pd[num_pds + 1].reg_value = 0;
  pd[num_pds + 1].reg_long_reset = 0;
  pd[num_pds + 1].reg_short_reset = 0;

  /* Commit the new pc/pd structures: */
  *num_pcsp += oparam.pfp_pmc_count;
  *num_pdsp += 2;
}

/* Return the CPU clock frequency as reported by the first
   "cpu MHz : <int>.<frac>" entry in /proc/cpuinfo.

   The result is <int> * 1000000 + <frac>, where <frac> is the digit
   string after the decimal point parsed as an integer and added
   unscaled.  NOTE(review): that equals Hz only when the kernel prints
   six fractional digits -- confirm for the target platform.

   If /proc/cpuinfo cannot be opened, or contains no such entry, an
   error is reported and the program is terminated via prog_stop().  */
unsigned long
get_cycle_frequency (void)
{
  unsigned long mhz = 0, frac = 0;
  char buf[8192];
  int found = 0;
  FILE *fp;

  fp = fopen ("/proc/cpuinfo", "r");
  if (!fp)
    {
      perror ("/proc/cpuinfo");
      prog_stop (-1);
    }

  for (;;)
    {
      if (fscanf (fp, "cpu MHz : %lu.%lu\n", &mhz, &frac) == 2)
	{
	  found = 1;
	  break;
	}
      /* Not the entry we want: discard the rest of the current line.
	 Stop at end-of-file (or on a read error) instead of spinning
	 forever when no "cpu MHz" entry exists.  */
      if (!fgets (buf, sizeof (buf), fp))
	break;
    }
  fclose (fp);

  if (!found)
    {
      fprintf (stderr, "/proc/cpuinfo: no \"cpu MHz\" entry found\n");
      prog_stop (-1);
    }
  return mhz * 1000000 + frac;
}

/* Body of a per-CPU profiling process.

   Sets up the PMU counters, creates and loads a system-wide perfmon
   context with the default sampling format, and then loops collecting
   samples until the file-scope flag `done' becomes set.  Afterwards
   the profile of every address space on addr_space_list is written
   out.

   CPU is only used for the verbose message and recorded in my_cpu;
   this function does not change the CPU affinity itself (the caller
   in main() pins the process before forking).

   Failures of the perfmon library or of perfmonctl() are fatal
   (panic); getrlimit()/setrlimit() failures exit with status -1.  */
static void
profile_cpu (int cpu)
{
  pfm_default_smpl_ctx_arg_t context;
  pfarg_reg_t pc[PFMLIB_MAX_PMCS];
  int num_pcs, ret, buffer_full;
  size_t sampling_buffer_size;
  pfmlib_options_t options;
  pfarg_load_t load_args;
  struct addr_space *as;
  char buf[PATH_MAX];
  unsigned int plm;
  char *kallsyms;
  pfm_msg_t msg;
  struct pollfd pfd;
  struct rlimit rlim;

  /* Needed by the setup_*_sampling() routines to turn a sampling
     rate into a counter reset value.  */
  cycle_frequency = get_cycle_frequency ();

  if (verbose)
    printf ("[%d] profiling on cpu %d with clock-frequency %lu Hz\n",
	    getpid (), cpu, cycle_frequency);

  if (pfm_initialize () != PFMLIB_SUCCESS)
    panic ("Can't initialize perfmon library (pfm_initialize() failed)\n");

  memset (&options, 0, sizeof (options));
  options.pfm_debug = 0;	/* set to 1 for debug */
  options.pfm_verbose = 0;	/* set to 1 for debug */
  pfm_set_options (&options);

  /* Start with empty PMC/PMD descriptor tables (pd and num_pds are
     not locals of this function).  */
  num_pcs = num_pds = 0;
  memset (pc, 0, sizeof (pc));
  memset (pd, 0, sizeof (pd));

  /* The order matters here: we want to set up call-count sampling
     first, so that it gets a lower-numbered PMD, because we need the
     last reset value, which the default sampling format stores only
     for the lowest-numbered PMD.  */
  /* Build the privilege-level mask from the monitoring options.  */
  plm = 0;
  if (monitor_kernel)
    plm |= PFM_PLM0;
  if (monitor_user)
    plm |= PFM_PLM3;

  /* Interruption monitoring uses BTB-based code sampling alone;
     otherwise call-count sampling is set up before code sampling.  */
  if (monitor_interruptions)
    setup_btb_code_sampling (pc, &num_pcs, pd, &num_pds, plm);
  else
    {
      setup_call_count_sampling (pc, &num_pcs, pd, &num_pds, plm);
      setup_code_sampling (pc, &num_pcs, pd, &num_pds, plm);
    }

  /* Try to up RLIMIT_MEMLOCK to something that lets us create a
     reasonably large sampling-buffer.  If that's not possible, try to
     use the maximum possible.  */

  sampling_buffer_size = SAMPLING_BUFFER_SIZE;
  if (getrlimit (RLIMIT_MEMLOCK, &rlim) < 0)
    {
      perror ("getrlimit");
      exit (-1);
    }
  if (rlim.rlim_cur < sampling_buffer_size)
    {
      /* The soft limit is too small: shrink the buffer to the hard
	 limit if necessary, then raise the soft limit to match.  */
      if (rlim.rlim_max < sampling_buffer_size)
	sampling_buffer_size = rlim.rlim_max;

      rlim.rlim_cur = sampling_buffer_size;
      if (setrlimit (RLIMIT_MEMLOCK, &rlim) < 0)
	{
	  perror ("setrlimit");
	  exit (-1);
	}
    }

  /* Create the perfmon context: */

  memset (&context, 0, sizeof (context));
  memcpy (context.ctx_arg.ctx_smpl_buf_id, (pfm_uuid_t) PFM_DEFAULT_SMPL_UUID,
	  sizeof (pfm_uuid_t));
  context.buf_arg.buf_size = sampling_buffer_size;
  context.ctx_arg.ctx_flags = PFM_FL_SYSTEM_WIDE;
  if (perfmonctl (0, PFM_CREATE_CONTEXT, &context, 1) == -1)
    {
      if (errno == ENOSYS)
	panic ("The kernel doesn't have perfmon enabled.\n");
      else
	panic ("perfmonctl(PFM_CREATE_CONTEXT) failed (%s)\n",
	       strerror (errno));
    }
  /* Remember where the sampling buffer is and which descriptor
     refers to the new context.  */
  sampling_buffer = context.ctx_arg.ctx_smpl_vaddr;
  sampling_fd = context.ctx_arg.ctx_fd;

  /* Program the PMC and PMD registers prepared above.  */
  if (perfmonctl (sampling_fd, PFM_WRITE_PMCS, pc, num_pcs) < 0)
    panic ("perfmonctl(PFM_WRITE_PMCS) failed (%s)\n", strerror (errno));
  if (perfmonctl (sampling_fd, PFM_WRITE_PMDS, pd, num_pds) < 0)
    panic ("perfmonctl(PFM_WRITE_PMDS) failed (%s)\n", strerror (errno));


  /* Load the context, passing our own pid as the load target.  */
  memset (&load_args, 0, sizeof (load_args));
  load_args.load_pid = getpid ();
  if (perfmonctl (sampling_fd, PFM_LOAD_CONTEXT, &load_args, 1) == -1)
    panic ("perfmonctl(PFM_LOAD_CONTEXT) failed (%s)\n", strerror (errno));

  pfd.fd = sampling_fd;
  pfd.events = POLLIN;
  pfd.revents = 0;

  gettimeofday (&now, NULL);

  /* Start monitoring. We must go to the kernel because psr.pp cannot
     be changed at the user level.  */
  if (perfmonctl(sampling_fd, PFM_START, 0, 0) == -1)
    panic ("perfmonctl(PFM_START): %s\n", strerror (errno));

  my_cpu = cpu;	/* sampling is active now */

  while (1)
    {
      /* Wait for (1) sampling buffer to fill up, (2) for timeout to
	 expire, or (3) for a signal to arrive.  */
      if ((ret = poll (&pfd, 1, poll_timeout)) < 0
	  && errno != EINTR)
	panic ("poll: %s\n", strerror (errno));

      /* A positive poll() result means the descriptor is readable,
	 i.e. an overflow notification is pending; a timeout (0) or an
	 interrupted call (< 0 with EINTR) is not a full buffer.  */
      buffer_full = (ret > 0);

      if (buffer_full)
	/* read & ignore the overflow  notification message */
	read (sampling_fd, &msg, sizeof (msg));
      else if (perfmonctl (sampling_fd, PFM_STOP, 0, 0) == -1)
	panic ("perfmonctl(PFM_STOP): %s\n", strerror (errno));

      process_pmu_samples ();

      /* `done' is not set anywhere in this function; it is checked
	 only after the samples collected so far were processed.  */
      if (done)
	break;

      if (perfmonctl (sampling_fd, PFM_RESTART, NULL, 0) < 0)
	panic ("perfmonctl(PFM_RESTART)\n");

      /* If we explicitly stopped monitoring above, start it again.  */
      if (!buffer_full && perfmonctl (sampling_fd, PFM_START, 0, 0) == -1)
	panic ("perfmonctl(PFM_START): %s\n", strerror (errno));
    }

  /* When the kernel was monitored, try to obtain a copy of
     /proc/kallsyms; kallsyms stays NULL if that does not succeed.  */
  kallsyms = NULL;
  if (monitor_kernel)
    if (q_checksummed_link (Q_LINK_FORCE_COPY, buf, sizeof (buf),
			    "kallsyms", "/proc/kallsyms") == 0)
      kallsyms = buf;

  /* Write out one profile per recorded address space.  */
  for (as = addr_space_list; as; as = as->next)
    write_profile (as, kallsyms);
}

/* Parse OPTARG as a positive floating-point sampling rate and store
   it in *RATEP.

   RATEP: destination; left untouched when OPTARG is rejected.
   OPTNAME: option name used in the diagnostic.
   OPTARG: the option's argument string.

   An argument that is empty, has trailing garbage, or does not denote
   a value greater than zero is reported on stderr and ignored; the
   function returns normally in that case.  */
static void
get_rate (double *ratep, const char *optname, const char *optarg)
{
  char *end;
  double rate;

  rate = strtod (optarg, &end);
  /* Require that at least one character was converted and that the
     conversion consumed the entire argument.  */
  if (end == optarg || *end != '\0')
    {
      fprintf (stderr,
	       "%s: %s option expects floating-point number argument\n",
	       prog_name, optname);
      return;
    }
  /* Validate the value just parsed (not the current global rate).
     Written as !(rate > 0) so that NaN is rejected as well.  */
  if (!(rate > 0.0))
    {
      fprintf (stderr, "%s: invalid sampling-rate %s; must be positive\n",
	       prog_name, optarg);
      return;
    }
  *ratep = rate;
}

/* Print the command-line synopsis and option summary to FP.  The
   defaults shown are the current values of call_sample_rate,
   code_sample_rate, duration and cs_event_name.  */
static void
usage (FILE *fp)
{
  static const char help_format[] =
    "Usage: %s [-hku] [-c rate] [-C rate] [-t duration] [command]\n"
    "  -h|--help:\t\t\tPrint this help message.\n"
    "  -c|--call-sample-rate RATE:\tSet call sampling rate to RATE"
    " (default: %.0f/s)\n"
    "  -C|--code-sample-rate RATE:\tSet code sampling rate to RATE"
    " (default: %.0f/s)\n"
    "  -i|--monitor-interruptions:\tMonitor interruptions\n"
    "  -k|--monitor-kernel:\t\tMonitor kernel-level execution\n"
    "  -m|--merge-unknown=[on|off]:\tMerge samples of unknown "
    "processes (default: on)\n"
    "  -u|--monitor-user:\t\tMonitor user-level execution\n"
    "  -t|--sample-duration TIME:\tSample for TIME seconds"
    " (default: %d)\n"
    "  -e|--code-sample-event EVENT:\tUse EVENT to sample code"
    " (default: %s)\n\n"
    " By default, %s monitors both kernel and user-level execution.\n";

  fprintf (fp, help_format,
	   prog_name,		/* synopsis line */
	   call_sample_rate,	/* -c default */
	   code_sample_rate,	/* -C default */
	   duration,		/* -t default */
	   cs_event_name,	/* -e default */
	   prog_name);		/* closing remark */
}

/* Inspect the wait status STATUS of child PID and panic if the child
   exited with a non-zero exit code or was terminated by a signal.
   Any other status is accepted silently.  */
static void
check_child_status (pid_t pid, int status)
{
  if (WIFEXITED (status))
    {
      int exit_code = WEXITSTATUS (status);

      if (exit_code != 0)
	panic ("%s: child exited with status %d\n",
	       prog_name, exit_code);
      return;
    }

  if (WIFSIGNALED (status))
    panic ("%s: child %d died of uncaught signal %d\n",
	   prog_name, pid, WTERMSIG (status));
}

/* Terminate the program: send SIGTERM to every child recorded in
   child_pid[0 .. num_cpus-1] and then exit with EXIT_STATUS.  Does
   not return and does not wait for the children.  */
void
prog_stop (int exit_status)
{
  int i;

  for (i = 0; i < num_cpus; ++i)
    /* Only signal real children.  A slot may hold 0 (never filled)
       or -1 (the raw result of a failed fork()); kill() treats those
       as "own process group" and "every process we may signal"
       respectively, which must never happen here.  */
    if (child_pid[i] > 0)
      kill (child_pid[i], SIGTERM);
  exit (exit_status);
}

/* Program entry point.

   Parses the command line, then for each CPU the master can bind
   itself to, pins itself there and forks one child that runs
   profile_cpu().  Afterwards the master either sleeps for `duration'
   seconds or runs the command given on the command line and waits
   for it.  Finally all profiling children are sent SIGTERM and
   reaped; a child that failed makes check_child_status() panic.

   Returns 0 on success; usage errors exit with status 1.  */
int
main (int argc, char **argv)
{
  pid_t master_pid = getpid ();
  int ch, i, cpu, num_children, option_index = 0;
  cpu_set_t cpu_set, all_cpus;
  struct sigaction act;
  int status;

  prog_argv = (const char **) argv;

  /* Use the basename of argv[0] in diagnostics.  */
  prog_name = strrchr (argv[0], '/');
  if (prog_name)
    ++prog_name;
  else
    prog_name = argv[0];

  while (1)
    {
      ch = getopt_long (argc, argv, "c:C:e:hikm:t:uv",
			long_opts, &option_index);
      if (ch == -1)
	break;

      switch (ch)
	{
	case 0:
	  panic ("%s: option %s not handled.",
		 prog_name, long_opts[option_index].name);

	case 'c':
	  get_rate (&call_sample_rate, long_opts[option_index].name, optarg);
	  break;

	case 'C':
	  get_rate (&code_sample_rate, long_opts[option_index].name, optarg);
	  break;

	case 'e':
	  cs_event_name = optarg;
	  break;

	case 'i':
	  monitor_interruptions = 1;
	  break;

	case 't':
	  duration = atol (optarg);
	  break;

	case 'h':
	  usage (stdout);
	  exit (1);

	case 'k':
	  monitor_kernel = 1;
	  break;

	case 'm':
	  if (!optarg || strcmp (optarg, "on") == 0)
	    merge_unknown_addr_spaces = 1;
	  else if (strcmp (optarg, "off") == 0)
	    merge_unknown_addr_spaces = 0;
	  else
	    if (option_index)
	      panic ("%s: value for `--%s' must be `on' or `off', not `%s'\n",
		     prog_name, long_opts[option_index].name, optarg);
	    else
	      panic ("%s: value for `-m' must be `on' or `off', not `%s'\n",
		     prog_name, optarg);
	  break;

	case 'u':
	  monitor_user = 1;
	  break;

	case 'v':
	  verbose = 1;
	  break;

	case ':':
	  fprintf (stderr, "%s: parameter missing\n", prog_name);
	  usage (stderr);
	  exit (1);

	case '?':
	  fprintf (stderr, "%s: Unknown option, ambiguous match, or extra "
		   "parameter\n", prog_name);
	  usage (stderr);
	  exit (1);

	default:
	  break;
	}
    }

  /* Neither -k nor -u given: monitor both levels.  */
  if (!monitor_kernel && !monitor_user)
    monitor_kernel = monitor_user = 1;

  /* We need either a duration or a command to run.  */
  if (!duration && optind >= argc)
    {
      usage (stderr);
      exit (1);
    }

  CPU_ZERO (&cpu_set);
  CPU_ZERO (&all_cpus);

  num_cpus = sysconf (_SC_NPROCESSORS_ONLN);

  child_pid = malloc (num_cpus * sizeof (child_pid[0]));
  if (!child_pid)
    {
      perror ("malloc");
      exit (1);
    }
  memset (child_pid, 0, num_cpus * sizeof (child_pid[0]));

  /* Install the SIGTERM handler before forking so that the profiling
     children inherit it.  */
  memset (&act, 0, sizeof (act));
  act.sa_handler = (sig_t) sigterm_handler;
  act.sa_flags = SA_RESTART | SA_SIGINFO;
  sigaction (SIGTERM, &act, NULL);

  /* Walk the CPU numbers; for every CPU we can bind to, fork one
     profiler while pinned there so the child inherits the affinity.
     `i' counts the children started so far.  */
  for (i = cpu = 0; i < num_cpus; )
    {
      if (cpu >= 8 * sizeof (cpu_set))
	/* sysconf() must have lied about _SC_NPROCESSORS_ONLN */
	break;

      CPU_SET (cpu, &cpu_set);
      if (my_sched_setaffinity (master_pid, sizeof (cpu_set), &cpu_set) == 0)
	{
	  pid_t pid;

	  CPU_SET (cpu, &all_cpus);
	  pid = fork ();
	  if (pid == 0)
	    {
	      profile_cpu (cpu);
	      exit (0);
	    }
	  if (pid < 0)
	    {
	      /* Never record -1 as a child: it would later be handed
		 to kill() and waitpid().  Stop the profilers that
		 were already started and give up.  */
	      perror ("fork");
	      prog_stop (1);
	    }
	  child_pid[i++] = pid;
	}
      CPU_CLR (cpu, &cpu_set);
      ++cpu;
    }
  /* May be smaller than num_cpus if the scan above ended early; the
     unused child_pid slots are 0 and must not be passed to kill().  */
  num_children = i;

  /* reset the affinity-mask so that newly forked tasks get distributed */
  if (my_sched_setaffinity (master_pid, sizeof (all_cpus), &all_cpus) == -1)
    {
      perror ("sched_setaffinity()");
      exit (-1);
    }

  if (duration > 0)
    sleep (duration);
  else if (optind < argc)
    {
      pid_t child = fork ();

      if (child < 0)
	{
	  /* Do not fall through to execvp(): that would replace the
	     master itself with the command.  */
	  perror ("fork");
	  prog_stop (1);
	}
      else if (child > 0)
	{
	  /* Only inspect `status' if waitpid() actually filled it in.  */
	  if (waitpid (child, &status, 0) == child)
	    check_child_status (child, status);
	}
      else
	{
	  execvp (argv[optind], argv + optind);
	  /* Only reached if execvp() failed.  */
	  perror (argv[optind]);
	  _exit (-1);
	}
    }

  for (i = 0; i < num_children; ++i)
    /* send stop-signal to children all at once (more or less) */
    kill (child_pid[i], SIGTERM);

  for (i = 0; i < num_children; ++i)
    {
      if (waitpid (child_pid[i], &status, 0) == child_pid[i])
	check_child_status (child_pid[i], status);
    }
  return 0;
}
