Add --trace support (in blktrace format)

In an effort understand what PNOR requests come from the host, it'd be
good to be able to trace what requests come in and visualise them.
blktrace is some Linux infrastructure for tracing block device activity
all the way through the linux block layer, for which there is a variety
of existing tooling. These tools process the (typically) kernel produced
blktrace output. We can produce this same output programatically from
mboxd though.

This patch gives us the (option) to start mboxd in a mode where it will
write a blktrace file out, which can be fed into tools like blkparse(1)
or tools like iowatcher[1] to generate charts (and video).

A quirk of the blktrace format is that it's very geared towards a full
IO subsystem, so we can't directly map window operations (what we know
in mboxd) to specific IO ops (i.e. we don't get "firmware read one page
out of this window before closing it"). So, for each Window opening (or
reusing a cached one), we write THREE blktrace events: a Queue,
Dispatch, and Complete.

We can usk tools like blkparse to do everything from get a detailed list
of what windows were opened and for how long:

  0,0    0        1     0.000000000     0  Q   R 0 + 8 [(null)]
  0,0    0        2     0.000000000     0  D   R 0 + 8 [(null)]
  0,0    0        3     0.000182022     0  C   R 0 + 8 [0]
  0,0    0        4     0.042416351     0  Q   R 4144 + 2040 [(null)]
  0,0    0        5     0.042416351     0  D   R 4144 + 2040 [(null)]
  0,0    0        6     0.060802662     0  C   R 4144 + 2040 [0]
  0,0    0        7     0.084775813     0  Q   R 64 + 288 [(null)]
  0,0    0        8     0.084775813     0  D   R 64 + 288 [(null)]
  0,0    0        9     0.087835720     0  C   R 64 + 288 [0]
  0,0    0       10     1.429234244     0  Q   R 8488 + 2048 [(null)]

to getting a simple summary at the end of how many windows were opened
read and read/write:

CPU0 (0,0):
 Reads Queued:          90,   74,040KiB	 Writes Queued:           6,    2,664KiB
 Read Dispatches:       90,   74,040KiB	 Write Dispatches:        6,    2,664KiB
 Reads Requeued:         0		 Writes Requeued:         0
 Reads Completed:       90,   74,040KiB	 Writes Completed:        6,    2,664KiB
 Read Merges:            0,        0KiB	 Write Merges:            0,        0KiB
 Read depth:             1        	 Write depth:             1
 IO unplugs:             0        	 Timer unplugs:           0

If you change the window size to something tiny, like 4096 bytes, you
can get detailed paging information for hostboot at the expense of IPL
time.

Pretty graphs and animations:
https://www.flamingspork.com/blog/?p=4419

[1] iowatcher: http://masoncoding.com/iowatcher/

Change-Id: I5dd02b6bc616c441abf54d87a5d67c972cbaf228
Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
[AJ: Resolve merge conflicts, some tidy ups]
Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
diff --git a/mboxd.c b/mboxd.c
index 2026023..3f78410 100644
--- a/mboxd.c
+++ b/mboxd.c
@@ -54,7 +54,8 @@
 	"\t\t\t\t(default: fill the reserved memory region)\n"
 	"\t-w | --window-size\tThe window size (power of 2) in MB\n"
 	"\t\t\t\t(default: 1MB)\n"
-	"\t-f | --flash\t\tSize of flash in [K|M] bytes\n\n";
+	"\t-f | --flash\t\tSize of flash in [K|M] bytes\n\n"
+	"\t-t | --trace\t\tFile to write trace data to (in blktrace format)\n\n";
 
 static int dbus_init(struct mbox_context *context,
 		     const struct transport_ops **ops)
@@ -236,6 +237,7 @@
 		{ "window-num",		optional_argument,	0, 'n' },
 		{ "verbose",		no_argument,		0, 'v' },
 		{ "syslog",		no_argument,		0, 's' },
+		{ "trace",		optional_argument,	0, 't' },
 		{ "version",		no_argument,		0, 'V' },
 		{ "help",		no_argument,		0, 'h' },
 		{ 0,			0,			0, 0   }
@@ -309,6 +311,17 @@
 		case 'V':
 			printf("%s V%s\n", THIS_NAME, PACKAGE_VERSION);
 			exit(0);
+		case 't':
+			context->blktracefd = open(argv[optind],
+						   O_CREAT|O_TRUNC|O_WRONLY,
+						   0666);
+			MSG_INFO("Recording blktrace output to %s\n",
+				 argv[optind]);
+			if (context->blktracefd == -1) {
+				perror("Couldn't open blktrace file for writing");
+				exit(2);
+			}
+			break;
 		case 'h':
 			return false; /* This will print the usage message */
 		default:
diff --git a/mboxd.h b/mboxd.h
index 96235d7..6b14a24 100644
--- a/mboxd.h
+++ b/mboxd.h
@@ -5,6 +5,7 @@
 #define MBOX_H
 
 #include <assert.h>
+#include <linux/blktrace_api.h>
 #include <mtd/mtd-abi.h>
 #include <systemd/sd-bus.h>
 #include <poll.h>
@@ -101,6 +102,11 @@
 	uint32_t mem_size;
 	/* LPC Bus Base Address (bytes) */
 	uint32_t lpc_base;
+
+	/* Tracing */
+	int blktracefd;
+	struct blk_io_trace trace;
+	int64_t blktrace_start;
 };
 
 #endif /* MBOX_H */
diff --git a/protocol.c b/protocol.c
index 7158bfc..8f2f742 100644
--- a/protocol.c
+++ b/protocol.c
@@ -5,6 +5,7 @@
 #include <errno.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <unistd.h>
 
 #include "backend.h"
 #include "common.h"
@@ -13,6 +14,7 @@
 #include "protocol.h"
 #include "windows.h"
 
+
 #define BLOCK_SIZE_SHIFT_V1		12 /* 4K */
 
 static inline uint8_t protocol_get_bmc_event_mask(struct mbox_context *context)
@@ -155,6 +157,108 @@
 	return lpc_addr >> context->backend.block_size_shift;
 }
 
+static inline int64_t blktrace_gettime(void)
+{
+	struct timespec ts;
+	int64_t n;
+
+	clock_gettime(CLOCK_REALTIME, &ts);
+	n = (int64_t)(ts.tv_sec) * (int64_t)1000000000 + (int64_t)(ts.tv_nsec);
+
+	return n;
+}
+
+static void blktrace_flush_start(struct mbox_context *context)
+{
+	struct blk_io_trace *trace = &context->trace;
+	struct timespec now;
+
+	if (!context->blktracefd)
+		return;
+
+	if (!context->blktrace_start) {
+		clock_gettime(CLOCK_REALTIME, &now);
+		context->blktrace_start = blktrace_gettime();
+	}
+
+	trace->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+	trace->sequence++;
+	trace->time = blktrace_gettime() - context->blktrace_start;
+	trace->sector = context->current->flash_offset / 512;
+	trace->bytes = context->current->size;
+	if (context->current_is_write)
+		trace->action = BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_WRITE);
+	else
+		trace->action = BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_READ);
+	trace->pid = 0;
+	trace->device = 0;
+	trace->cpu = 0;
+	trace->error = 0;
+	trace->pdu_len = 0;
+	write(context->blktracefd, &trace, sizeof(trace));
+	trace->sequence++;
+	trace->time = blktrace_gettime() - context->blktrace_start;
+	trace->action &= ~BLK_TA_QUEUE;
+	trace->action |= BLK_TA_ISSUE;
+	write(context->blktracefd, &trace, sizeof(trace));}
+
+static void blktrace_flush_done(struct mbox_context *context)
+{
+	struct blk_io_trace *trace = &context->trace;
+
+	if (!context->blktracefd)
+		return;
+
+	trace->sequence++;
+	trace->time = blktrace_gettime() - context->blktrace_start;
+	trace->action &= ~BLK_TA_ISSUE;
+	trace->action |= BLK_TA_COMPLETE;
+	write(context->blktracefd, &trace, sizeof(trace));
+}
+
+static void blktrace_window_start(struct mbox_context *context)
+{
+	struct blk_io_trace *trace = &context->trace;
+
+	if (!context->blktracefd)
+		return;
+
+	if (!context->blktrace_start)
+		context->blktrace_start = blktrace_gettime();
+
+	trace->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+	trace->sequence++;
+	trace->time = blktrace_gettime() - context->blktrace_start;
+	trace->action = BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_READ);
+	trace->pid = 0;
+	trace->device = 0;
+	trace->cpu = 0;
+	trace->error = 0;
+	trace->pdu_len = 0;
+}
+
+static void blktrace_window_done(struct mbox_context *context)
+{
+	struct blk_io_trace *trace = &context->trace;
+
+	if (!context->blktracefd)
+		return;
+
+	trace->sector = context->current->flash_offset / 512;
+	trace->bytes = context->current->size;
+	write(context->blktracefd, &trace, sizeof(trace));
+	trace->sequence++;
+	trace->action &= ~BLK_TA_QUEUE;
+	trace->action |= BLK_TA_ISSUE;
+	write(context->blktracefd, &trace, sizeof(trace));
+
+	trace->sequence++;
+	trace->time = blktrace_gettime() - context->blktrace_start;
+	trace->action &= ~BLK_TA_ISSUE;
+	trace->action |= BLK_TA_COMPLETE;
+	write(context->blktracefd, &trace, sizeof(trace));
+}
+
 static int protocol_v1_create_window(struct mbox_context *context,
 				     struct protocol_create_window *io)
 {
@@ -180,7 +284,9 @@
 		 * write_flush() to make sure we pick the right one.
 		 */
 		if (context->current_is_write) {
+			blktrace_flush_start(context);
 			rc = context->protocol->flush(context, NULL);
+			blktrace_flush_done(context);
 			if (rc < 0) {
 				MSG_ERR("Couldn't Flush Write Window\n");
 				return rc;
@@ -192,6 +298,7 @@
 	/* Offset the host has requested */
 	MSG_INFO("Host requested flash @ 0x%.8x\n", offset);
 	/* Check if we have an existing window */
+	blktrace_window_start(context);
 	context->current = windows_search(context, offset,
 					  context->version == API_VERSION_1);
 
@@ -206,6 +313,7 @@
 			return rc;
 		}
 	}
+	blktrace_window_done(context);
 
 	context->current_is_write = !io->req.ro;