Logo Search packages:      
Sourcecode: salinfo version File versions

salinfo_decode.c

/*
 * decode.c - program to decode SAL error records
 *
 * Copyright (c) 2003 Hewlett-Packard Co
 *    Bjorn Helgaas <bjorn.helgaas@hp.com>
 * 2003-11-05 Add options.
 *          Handle SAL records as well as standalone raw records.
 *          Keith Owens <kaos@sgi.com>
 * 2003-11-16 Break out oem data decoder so each platform can handle the
 *          oem data as it likes.
 *          Keith Owens <kaos@sgi.com>
 * 2004-10-04 Handle kernels that clear the bit themselves when there is no data.
 *          Keith Owens <kaos@sgi.com>
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */

#include <sys/types.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/wait.h>

#include "mca.h"

/* Debug level; main() increments it once for every -d option.  Only declared
 * here (extern), so presumably defined in another source file.
 */
extern int debug;

static sal_log_record_header_t *
salinfo_buffer(int fd, int *bufsize)
{
      int nbytes, size, alloc;
      sal_log_record_header_t *buffer;

      lseek(fd, 0, 0);
      buffer = NULL;
      alloc = 16 * 1024;      // total buffer size
      size = 0;         // amount of buffer used so far
      do {
            buffer = realloc(buffer, alloc);
            if (!buffer) {
                  fprintf(stderr, "%s: Can't alloc %d bytes\n", __FUNCTION__, alloc);
                  exit(1);
            }

            nbytes = read(fd, buffer + size, alloc - size);
            if (nbytes < 0) {
                  perror("salinfo_buffer read");
                  exit(1);
            }

            if (nbytes == alloc - size)
                  alloc *= 2;

            size += nbytes;
      } while (nbytes);

      if (size) {
            if (bufsize)
                  *bufsize = size;
            return buffer;
      }

      free(buffer);
      return NULL;
}

/* Print the command line synopsis to stderr. */
static void
usage (void)
{
      fputs("Usage:\n"
            " salinfo_decode [-d] -t type -D directory\n"
            " salinfo_decode [-d] filename\n"
            "  -d              Increment debug level\n"
            "  -t type         Type of record to wait for (cmc, cpe, mca or init)\n"
            "  -D directory    Directory to store the raw and decoded records\n"
            "  filename        Decode a saved raw record, without involving SAL\n"
            , stderr);
}

/* Ben Woodard of RedHat changed the kernel salinfo code around 2.6.9-rc3 to
 * clear the cpu state bit if there is no data.  He did not add any indication
 * to user space of this change, which means that user space must deduce if it
 * is running on a kernel with or without Ben Woodard's change :(.
 *
 * Start off by assuming we are running on a changed kernel, and do not write
 * 'clear' to the kernel when there is no data.  When running on an old kernel,
 * user space will then be invoked repeatedly with no data.  Detect this loop
 * for an old kernel and turn on do_clear.
 */

/*
 * Tell the kernel to clear the record for a cpu by writing "clear <cpu>\n"
 * to the data file.
 *
 * fd_data       - descriptor of the data file to write to.
 * cpu           - cpu number placed in the command.
 * data_filename - name of the data file, used in error messages only.
 * have_data     - non-zero if a record was read for this cpu.
 *
 * When there is no data the command is only written once do_clear has been
 * turned on.  do_clear is turned on the second time that a cpu number less
 * than or equal to the previous one is seen without an intervening call with
 * have_data set; it is never turned off again.
 *
 * Returns 0 on success or when nothing had to be written, 1 if the write
 * failed.
 */
static int
clear_cpu(int fd_data, int cpu, const char *data_filename, int have_data)
{
      char text[400];
      int l, saved_errno;
      static int prev_cpu = -1, loop = 0, do_clear = 0;

      if (have_data)
            loop = 0;
      if (!do_clear) {
            /* The cpu number did not advance, we may be looping with no data */
            if (cpu <= prev_cpu) {
                  ++loop;
                  if (loop == 2)
                        do_clear = 1;
            }
            prev_cpu = cpu;
      }
      if (!have_data && !do_clear)
            return 0;

      /* snprintf returns the length of the command, no need to measure it again */
      l = snprintf(text, sizeof(text), "clear %d\n", cpu);
      if (write(fd_data, text, l) != l) {
            /* fprintf may change errno, keep the write error for perror */
            saved_errno = errno;
            fprintf(stderr, "%s: Error writing '%s' to %s\n",
                  __FUNCTION__, text, data_filename);
            errno = saved_errno;
            perror(data_filename);
            return 1;
      }
      return 0;
}

/* Descriptors used to talk to the oem decoder.  sig_chld sets [1] to -1. */
static int oemdata_fd[2];
static volatile sig_atomic_t child_died;  /* lock free flag to detect child death */

/*
 * SIGCHLD handler.  Records that the child died, invalidates oemdata_fd[1]
 * and reaps any terminated children.
 *
 * errno is saved and restored so that the interrupted code never sees an
 * error code set by the handler, and the children are reaped with WNOHANG so
 * the handler cannot block.
 */
static void
sig_chld (int sig)
{
      int saved_errno = errno;

      (void)sig;
      child_died = 1;
      oemdata_fd[1] = -1;
      while (waitpid(-1, NULL, WNOHANG) > 0)
            ;
      errno = saved_errno;
}

/* See if this platform has supplied a program to decode oem data.
 *
 * Creates two pipes, installs sig_chld as the SIGCHLD handler and forks a
 * child which tries to exec "salinfo_decode_oem" (searched for by execlp)
 * with fd 0 and fd 1 connected to the pipes.
 *
 * Returns the file scope oemdata_fd array.  [0] is read by the parent, [1]
 * is written by the parent, or is -1 if the child is already known to have
 * died.  pipe, dup2 and fork failures are fatal and exit with status 1.
 */
static int *
fork_oemdata(void)
{
      static const char pgm[] = "salinfo_decode_oem";
      pid_t pid;
      int pp[2], pc[2];       /* parent writes to pp[1], child writes to pc[1] */

      if (pipe(pp) || pipe(pc)) {
            fprintf(stderr, "%s: pipe failed (%m), giving up\n", __FUNCTION__);
            exit(1);
      }
      signal(SIGCHLD, sig_chld);
      if ((pid = fork()) == 0) {
            /* child reads fd 0, writes fd 1 */
            if (dup2(pp[0], 0) < 0 || dup2(pc[1], 1) < 0) {
                  fprintf(stderr, "%s: dup2 failed (%m), giving up\n", __FUNCTION__);
                  _exit(1);
            }
            close(pp[0]);
            close(pp[1]);
            close(pc[0]);
            close(pc[1]);
            execlp(pgm, pgm, NULL);
            /* Only reached when the exec failed.  Use _exit so the child
             * neither flushes stdio buffers inherited from the parent into
             * the pipe now on fd 1 nor runs the parent's atexit handlers.
             */
            _exit(0);
      } else if (pid >= 0) {
            /* parent writes oemdata_fd[1], reads oemdata_fd[0] */
            close(pp[0]);
            close(pc[1]);
            oemdata_fd[0] = pc[0];  /* mca.c read from pgm */
            oemdata_fd[1] = pp[1];  /* mca.c write to pgm */
            __asm__ __volatile__ ("" ::: "memory");   /* lock free barrier */
            /* The handler may have run before oemdata_fd[1] was set above */
            if (child_died)
                  oemdata_fd[1] = -1;
            return oemdata_fd;
      } else {
            fprintf(stderr, "%s: fork failed (%m), giving up\n", __FUNCTION__);
            exit(1);
      }
}

/* Talk to /proc/sal/type/{event,data} to extract, save, decode and clear SAL
 * records.
 *
 * type      - record type, selects /proc/sal/<type>/event and
 *             /proc/sal/<type>/data.
 * directory - base directory.  Raw records are saved in <directory>/raw and
 *             the decoded text in <directory>/decoded.
 *
 * For every event the matching record is read from the data file, saved raw
 * under a unique name, decoded by platform_info_print() with stdout and
 * stderr redirected to the decoded file, and then cleared with clear_cpu().
 *
 * Runs until something fails.  Returns 0 if reading the event file was
 * interrupted by a signal (EINTR) and 1 for every other failure.
 */
static int
talk_to_sal (const char *type, const char *directory)
{
      sal_log_record_header_t *buffer = NULL;
      char event_filename[PATH_MAX], data_filename[PATH_MAX], text[200];
      int fd_event = -1, fd_data = -1, fd = -1, i, cpu, ret = 1;
      int *oem_fd = NULL;
      static const char *rd[] = { "raw", "decoded" };

      /* Check that files can be created in both output directories */
      for (i = 0; i < 2; ++i) {
            char filename[PATH_MAX];
            snprintf(filename, sizeof(filename), "%s/%s/.check", directory, rd[i]);
            /* open() with O_CREAT must be given a mode */
            if ((fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR)) < 0) {
                  perror(filename);
                  goto out;
            }
            close(fd);
            fd = -1;
            unlink(filename);
      }

      snprintf(event_filename, sizeof(event_filename), "/proc/sal/%s/event", type);
      if ((fd_event = open(event_filename, O_RDONLY)) < 0) {
            perror(event_filename);
            goto out;
      }
      snprintf(data_filename, sizeof(data_filename), "/proc/sal/%s/data", type);
      if ((fd_data = open(data_filename, O_RDWR)) < 0) {
            perror(data_filename);
            goto out;
      }

      oem_fd = fork_oemdata();

      /* Run until we are killed */
      while (1) {
            int l, bufsize, suffix;
            ssize_t nread, nwritten;
            char filename[PATH_MAX], stem[PATH_MAX];

            /* Leave room for the terminator, read() does not add one */
            nread = read(fd_event, text, sizeof(text) - 1);
            if (nread <= 0) {
                  if (nread < 0 && errno == EINTR)
                        ret = 0;
                  else if (nread < 0)
                        perror(event_filename);
                  else
                        fprintf(stderr, "%s: Unexpected end of file on %s\n",
                              __FUNCTION__, event_filename);
                  goto out;
            }
            text[nread] = '\0';
            if (sscanf(text, "read %d\n", &cpu) != 1) {
                  fprintf(stderr, "%s: Unknown text '%s' from %s\n",
                        __FUNCTION__, text, event_filename);
                  goto out;
            }
            /* Echo the event text to the data file to select the record */
            l = (int)nread;
            if (write(fd_data, text, l) != l) {
                  int saved_errno = errno;
                  fprintf(stderr, "%s: Error writing '%s' to %s\n",
                        __FUNCTION__, text, data_filename);
                  errno = saved_errno;
                  perror(data_filename);
                  goto out;
            }
            if (!(buffer = salinfo_buffer(fd_data, &bufsize))) {
                  if (clear_cpu(fd_data, cpu, data_filename, 0))
                        goto out;
                  continue;   /* event but no data is normal at boot */
            }

            /* Part of the name shared by the raw and the decoded file */
            snprintf(stem, sizeof(stem),
                  "%02x%02x-%02x-%02x-%02x_%02x_%02x-cpu%d-%s",
                  buffer->timestamp.slh_century,
                  buffer->timestamp.slh_year,
                  buffer->timestamp.slh_month,
                  buffer->timestamp.slh_day,
                  buffer->timestamp.slh_hour,
                  buffer->timestamp.slh_minute,
                  buffer->timestamp.slh_second,
                  cpu,
                  type);

            /* O_EXCL: use the first suffix that does not exist yet */
            for (suffix = 0; ; ++suffix) {
                  snprintf(filename, sizeof(filename), "%s/raw/%s.%d",
                        directory, stem, suffix);
                  if ((fd = open(filename, O_WRONLY|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR)) >= 0)
                        break;
                  if (errno != EEXIST) {
                        perror(filename);
                        goto out;
                  }
            }

            for (l = 0; l < bufsize; ) {
                  nwritten = write(fd, (char *)buffer + l, bufsize - l);
                  if (nwritten <= 0) {
                        perror(filename);
                        goto out;
                  }
                  l += nwritten;
            }
            i = close(fd);
            fd = -1;
            if (i < 0) {
                  perror(filename);
                  goto out;
            }

            /* Redirect stdout and stderr to the decoded file.  freopen
             * closes the old stream itself, so the streams are never closed
             * twice and never left pointing at a closed FILE.
             */
            snprintf(filename, sizeof(filename), "%s/decoded/%s.%d",
                  directory, stem, suffix);
            if (!freopen(filename, "a", stdout)) {
                  perror(filename);
                  goto out;
            }
            if (!freopen(filename, "a", stderr)) {
                  /* stderr is closed now, the decoded file is all we have */
                  printf("%s: Can't redirect stderr to %s\n", __FUNCTION__, filename);
                  goto out;
            }

            printf("BEGIN HARDWARE ERROR STATE from %s on cpu %d\n", type, cpu);
            platform_info_print(buffer, 1, fd_data, cpu, oem_fd);
            printf("END HARDWARE ERROR STATE from %s on cpu %d\n", type, cpu);
            free(buffer);
            buffer = NULL;
            /* Get the decoded text on disk before the record is cleared */
            fflush(stdout);
            fflush(stderr);
            if (clear_cpu(fd_data, cpu, data_filename, 1))
                  goto out;

      }

out:
      free(buffer);
      if (fd >= 0)
            close(fd);
      if (oem_fd && oem_fd[1] >= 0) {
            close(oem_fd[0]);
            close(oem_fd[1]);
            wait(NULL);
      }
      if (fd_event >= 0)
            close(fd_event);
      if (fd_data >= 0)
            close(fd_data);
      return ret;
}

/* Decode an existing raw file.
 *
 * The whole file is read with salinfo_buffer() and, if it holds any data,
 * decoded to stdout by platform_info_print() with the oem decoder attached.
 *
 * Returns 1 if the file cannot be opened, otherwise 0 (also when the file
 * holds no data).
 */
static int
decode_a_file (const char *filename)
{
      sal_log_record_header_t *record;
      int *oem_pipe = NULL;
      int raw_fd = open(filename, O_RDONLY);

      if (raw_fd < 0) {
            perror(filename);
            return 1;
      }

      record = salinfo_buffer(raw_fd, NULL);
      if (record != NULL) {
            oem_pipe = fork_oemdata();
            printf("BEGIN HARDWARE ERROR STATE from %s\n", filename);
            platform_info_print(record, 0, raw_fd, -1, oem_pipe);
            printf("END HARDWARE ERROR STATE from %s\n", filename);
      }

      /* Shut down the oem decoder, unless it was never started or is gone */
      if (oem_pipe != NULL && oem_pipe[1] > 0) {
            close(oem_pipe[0]);
            close(oem_pipe[1]);
            wait(NULL);
      }

      close(raw_fd);
      free(record);
      return 0;
}

/*
 * Parse the command line and run in one of two modes:
 *   -t type -D directory   wait for SAL records of that type (talk_to_sal)
 *   filename               decode one saved raw record (decode_a_file)
 * -d increments the debug level and -h prints the usage text.
 *
 * Returns the result of the selected mode, 0 after -h, or 1 after printing
 * the usage text for an invalid command line.
 */
int main(int argc, char **argv)
{
      char *type = NULL, *directory = NULL, *filename = NULL;
      int o;

      /* 'h' has to be in the option string, otherwise getopt reports -h as
       * an invalid option and the 'h' case below is never reached.
       */
      while ((o = getopt(argc, argv, "dht:D:")) != -1) {
            switch (o) {
            case 'd':
                  ++debug;
                  break;
            case 'h':
                  usage();
                  return 0;
            case 't':
                  type = optarg;
                  break;
            case 'D':
                  directory = optarg;
                  break;
            default:
                  usage();
                  return 1;
            }
      }

      /* A single remaining argument is the raw file to decode */
      if (optind == argc-1)
            filename = argv[optind++];
      /* -t and -D must be given together, and exactly one of the two modes
       * (type + directory, or filename) must be selected.
       */
      if (optind != argc ||
            argc == 1 ||
            (type && !directory) ||
            (!type && directory) ||
            (filename && type)  ||
            !(filename || type)) {
            usage();
            return 1;
      }

      if (type)
            return talk_to_sal(type, directory);
      else
            return decode_a_file(filename);
}

Generated by  Doxygen 1.6.0   Back to index