http-redirect: Minimal awk script to redirect http to https

A simple script written in awk to parse HTTP/1.1 requests and then
generate a reply to use the https:// URL.

Only the GET and HEAD methods are supported; others get Not Implemented.

Recognises multi-line headers, repeated headers, and absolute URI
(where the host is in the URI and the Host: header is ignored).
Supports host headers that include a port for non-standard redirect.
Decodes %xx url encoding before path lookup.

Includes systemd socket and service files.  Includes unit tests.

Includes trace subroutine that writes trace to a file if tracefile
variable set (either awk -v or additional awk -f segment).

Includes add-urls.awk that adds url {request, destination} pairs
read from a file, augmenting the valid URI array.

Prior version testing:
	Tested on witherspoon with awk script in /home/root/redirect.awk
         * ran Unit tests script
       	 * systemd units installed in /run/systemd/system/
         * curl -w %{redirect_url}'\n' http://hostname.example/

Tests:
	test cases in ./tests show location is parsed.
	with mawk 1.3.3
		nc -l -p 8080 < fifo | awk -f redirect.awk -Wi > fifo
		firefox http://localhost:8080
	with gawk 4.0.1
		nc -l -p 8080 -c "awk -f http-redirect.awk" &
		curl -v -k -L http://localhost:8080/
		curl -v -I -k -L http://localhost:8080/

Change-Id: Ic361247dedfd7bcff620d793dd7082c247c9e78e
Signed-off-by: Milton Miller <miltonm@us.ibm.com>
diff --git a/http-redirect/add-urls.awk b/http-redirect/add-urls.awk
new file mode 100644
index 0000000..768fb8c
--- /dev/null
+++ b/http-redirect/add-urls.awk
@@ -0,0 +1,8 @@
+# Optional fragment: load extra entries into https_resources from urlfile.
+# To use: awk -f http-redirect.awk -f $thisfile -v urlfile=FILE
+# Each line of the url file contains:  lookup redirect
+BEGIN {
+	while (urlfile && (getline < urlfile) > 0) {
+		https_resources[$1] = $2
+	}
+}
diff --git a/http-redirect/http-redirect.awk b/http-redirect/http-redirect.awk
new file mode 100644
index 0000000..513c356
--- /dev/null
+++ b/http-redirect/http-redirect.awk
@@ -0,0 +1,465 @@
+#!/usr/bin/awk -f
+#
+# A Minimal HTTP/1.1 server to redirect http URIs to https
+
+BEGIN {
+	# Line terminator on the wire, and a double quote for trace output
+	dquote = "\""
+	CRLF = "\r\n"
+
+	# Status text by code; msgtxt optionally replaces the reply body text
+	errors[400] = "400 Bad Request"
+	errors[404] = "404 Not Found"
+	errors[500] = "500 Internal Server Error"
+	errors[501] = "501 Not Implemented"
+	errors[505] = "505 HTTP Version Not Supported"
+	msgtxt[505] = "HTTP/1.1 only"
+	# Request methods that are implemented
+	methods["HEAD"] = 1; methods["GET"] = 1
+	# Only forward these resources to the designated paths over https
+	https_resources["/"] = "/"
+}
+
+# RFC2616 19.3: drop the CR(\r) that precedes the LF(\n) record separator
+{ sub(/\r$/, "") }
+
+# The first non-empty line is the HTTP request: method, URI and version.
+method == "" && $0 == "" { next }	# skip empty lines before the request
+
+method == "" {
+	version = $3
+	request_uri = $2
+	method = $1
+
+	# reject a bad version, URI or method before reading any headers
+	validate_request()
+
+	# headers start on the next line
+	next
+}
+
+# RFC2616 4.2: a line that starts with SP or HT continues the prior header
+/^[ \t]+/ {
+	# Trim the whitespace at both ends; one space joins the pieces below
+	sub(/^[ \t]*/, "")
+	sub(/[ \t]*$/, "")
+	trace("extend header >"header"< with content >"$0"<")
+
+	headers[header] = (headers[header] " " $0)
+	next
+}
+
+# A header line is a token, a : separator, then the content.  Implied LWS
+# around the : separator and at the end of the line is not significant.
+match($0, /[ \t]*:[ \t]*/) {
+	content = substr($0, RSTART + RLENGTH)
+	header = substr($0, 1, RSTART - 1)
+	sub(/[ \t]*$/, "", content)
+
+	# The field name must be a single token; anything else is a
+	# malformed request.
+	if (!is_token(header))
+		respond_error(400)
+
+	# Headers are case insensitive, so key the array by upper case.
+	header = toupper(header)
+
+	# RFC2616 4.2: repeating a header is only valid for comma separated
+	# lists, so a repeat is appended to the prior value after ", ".
+	if (header in headers)
+		prior = headers[header] ", "
+	else
+		prior = ""
+	headers[header] = prior content
+
+	trace("found header >"header"< with content >"headers[header]"<")
+	next
+}
+
+# An empty line ends the headers: the request is complete, so answer it.
+$0 == "" {
+	# A request body could be read here, but nothing needs it.
+	trace("end of request headers")
+
+	# Check the request line again, then the URI, the host and the path.
+	validate_request()
+	validate_uri(request_uri, split_uri)
+	host = find_host()
+	path = split_uri["path"]
+	validate_path_and_respond(host, path)
+	next
+}
+
+# Anything else is invalid: a header line needs either an indent or a :
+{
+	trace("Unparsed header line : >" $0 "<")
+
+	# record the whole line as a header with no content
+	headers[header = $0] = ""
+
+	# a bad HTTP version is reported in preference to the bad request
+	validate_request()
+	respond_error(400)
+	next
+}
+
+############################################################
+
+# Check the request line fields; respond with 505, 400 or 501 and exit if bad.
+function validate_request() {
+	trace("version >"version"<")
+	trace("uri >"request_uri"<")
+	trace("method >"method"<")
+	if (version !~ /^HTTP\/0*1[.][0-9]+$/)	# whole field, leading 0s allowed
+		respond_error(505)		# Version Not Supported
+	if (bad_uric(request_uri))
+		respond_error(400)		# Bad Request (bogus encoding)
+	if (!(method in methods))		# ! binds tighter than in
+		respond_error(501)		# Not Implemented
+}
+
+# Split request_uri into split_uri[]; respond 400 unless it is an http URI.
+function validate_uri(request_uri, split_uri) {
+	split_url_components(request_uri, split_uri)
+	trace(dump_split_url(split_uri))
+	if (is_http_request_uri(split_uri))
+		return
+	respond_error(400)			# Bad Request (didn't parse)
+}
+
+# RFC2616 5.2: return the host to redirect to.  The host of an absolute URI
+# wins over the Host: header; responds 400 if no usable host is found.
+function find_host() {
+	if (!("HOST" in headers))
+		respond_error(400)
+	host = headers["HOST"]
+	trace("initial host is >" host "<")
+	if ("host" in split_uri) {
+		host = split_uri["host"]
+	} else if (match(host, /:[0-9]*$/)) {
+		# RFC 2616 14.23  Host header is host:port of URI
+		# RFC 2616 3.2.2 port may be not given or empty
+		host = substr(host, 1, RSTART - 1)
+	}
+	trace("prioritized host is >" host "<")
+
+	# A very relaxed check for domainlabel or IPv4.
+	if (host ~ /^[0-9a-zA-Z.-]+$/) {
+		trace("host passed regex")
+		return host
+	}
+	respond_error(400)
+}
+
+# Answer the request: a 308 redirect to https on host for a path listed in
+# https_resources, otherwise 404.  Does not return.
+function validate_path_and_respond(host, path) {
+	# URIs must be unescaped before compare, but forwarded unmodified
+	lookup = unescape(path)
+	trace("lookup path is >" lookup "<")
+
+	# Rather than be an open redirector, return Not Found
+	if (!(lookup in https_resources)) {
+		respond_error(404)		# Not Found
+
+		# get noisy response if we didn't exit above
+		trace("Failed to exit after response!")
+		exit 3
+	}
+
+	# Translate our whitelisted URI
+	newpath = "https://" host https_resources[lookup]
+	trace("Redirecting to >" newpath "<\n")
+	response = "308 Permanent Redirect"
+	reason = "Access with a https:// URL"
+	content = response CRLF newpath CRLF CRLF reason CRLF
+	respond_and_exit(response, content, newpath)
+}
+
+# RFC2616 2.2: a token is one or more US ASCII characters other than CTL
+# (000-037, 177), SP (040) and the separators ( ) < > @ , ; : \ " / [ ] ? = { }
+function is_token(token) {
+	if (token == "" || token ~ /[^\041-\176]/)
+		return 0
+	if (token ~ /[][()<>@,;:\\"\/?={}]/)	# a leading ] is a literal
+		return 0
+	return 1
+}
+
+# Return 1 if URI has a character that is not unreserved, reserved, or
+# part of a %xx escape, else 0.  A double quote is a delimiter, so it fails.
+function bad_uric(URI) {
+	# hide the well formed escapes; a leftover % fails the check below
+	gsub(/%[0-9a-fA-F][0-9a-fA-F]/, "", URI)
+
+	# fail if remaining characters are not in (mark alpha numeric reserved)
+	if (URI ~ /[^-_.!~*'()a-zA-Z0-9;\/?:@&=+$,]/)
+		return 1
+	return 0
+}
+
+# Value of one hex digit.  Only a few characters are expected, so index a
+# string rather than build a hex2int[chr] table.
+function hex2dec(chr) {
+	v = index("0123456789abcdef", tolower(chr))
+	if (!v) {
+		# bad_uric should have caught input
+		trace("bad hex2dec character >" chr "<")
+		respond_error(500)		# Internal Server Error
+	}
+	return v - 1
+}
+
+# Return input with each % hex hex escape replaced by the character that
+# has that code.  hex2dec() responds with an error on a bad hex digit.
+function unescape(input,  out) {
+	out = ""
+	# i is the position of the next % in what is left of input
+	for (i = index(input, "%"); i; i = index(input, "%")) {
+		# the two hex digits after the %: high half first, then low
+		code = hex2dec(substr(input, i + 1, 1)) * 16
+		code = code + hex2dec(substr(input, i + 2, 1))
+
+		# keep the text before the %, then add the decoded character
+		out = out substr(input, 1, i - 1) sprintf("%c", code)
+		input = substr(input, i + 3)
+	}
+
+	# whatever follows the last escape is copied unchanged
+	return out input
+}
+
+# Split url into components[] keyed by "scheme", "userinfo", "user",
+# "password", "host", "port", "path", "query" and "frag"; a key is only
+# set when that part is present.  With cues from RFC2396 appendix B et al.
+function split_url_components(url, components) {
+	if (match(url, /#/)) {
+		components["frag"] = substr(url, RSTART + 1)
+		url = substr(url, 1, RSTART - 1)
+	}
+
+	if (match(url, /\?/)) {
+		components["query"] = substr(url, RSTART + 1)
+		url = substr(url, 1, RSTART - 1)
+	}
+
+	if (match(url, /^[^:\/?#]+:/)) {
+		components["scheme"] = substr(url, 1, RLENGTH - 1)
+		url = substr(url, RLENGTH + 1)
+	}
+
+	# Maybe return early:  Separate the path from the authority.
+	if (substr(url, 1, 2) != "//") {
+		components["path"] = url
+		return
+	} else if (match(substr(url, 3), "/")) {
+		components["path"] = substr(url, 3 + RSTART - 1) # include the /
+		url = substr(url, 3, RSTART - 1)
+	} else {
+		url = substr(url, 3)
+	}
+
+	# Parse userinfo@host:port
+	if (match(url, /@/)) {
+		userinfo = substr(url, 1, RSTART - 1)
+		url = substr(url, RSTART + 1)
+		components["userinfo"] = userinfo
+		if (match(userinfo, ":")) {
+			# NOT RECOMMENDED
+			components["password"] = substr(userinfo, RSTART + 1)
+			userinfo = substr(userinfo, 1, RSTART - 1)
+		}
+		components["user"] = userinfo
+	}
+	if (match(url, ":")) {
+		# port is numeric or empty
+		components["port"] = substr(url, RSTART + 1)
+		url = substr(url, 1, RSTART - 1)
+	}
+	if (url)
+		components["host"] = url
+}
+
+# Return the line "key": "value"\n for array[key], or "" if key is not set.
+function dump_field_if_present(key, array,   r) {
+	r = ""		# local, so a caller's own r is left alone
+	if (key in array)
+		r = sprintf(dquote key dquote": "dquote"%s"dquote"\n", array[key])
+	return r
+}
+
+# Render the fields set in components[] as text for trace(); r is a local.
+function dump_split_url(components,   r) {
+	r = "split_url = {\n"
+	r = r dump_field_if_present("scheme", components)
+	r = r dump_field_if_present("userinfo", components)
+	r = r dump_field_if_present("host", components)
+	r = r dump_field_if_present("port", components)
+	r = r dump_field_if_present("path", components)
+	r = r dump_field_if_present("query", components)
+	r = r dump_field_if_present("frag", components)
+	r = r "}\n"
+
+	return r
+}
+
+# RFC2616 3.2.2: return 1 if split_url holds an http absoluteURI or an
+# abs_path, else 0.  An absoluteURI with no path has its path set to "/".
+function is_http_request_uri(split_url) {
+	# Fragments are handled by the client, user info is not on the wire.
+	if ("frag" in split_url)
+		return 0
+	if ("userinfo" in split_url)
+		return 0
+	trace("not frag, no user")
+	# An absoluteURI has both scheme and host, an abs_path has neither.
+	if (("scheme" in split_url) && !("host" in split_url))
+		return 0
+	if (("host" in split_url) && !("scheme" in split_url))
+		return 0
+	trace("scheme host ok")
+
+	if ("scheme" in split_url) {
+		trace("original scheme is:  >" split_url["scheme"] "<")
+		scheme = unescape(split_url["scheme"])
+		trace("unescaped scheme is: >" scheme "<")
+		# HTTP 2616 3.2.3: compare the scheme without regard to case
+		if (tolower(scheme) != "http")
+			return 0
+		trace("scheme is http")
+
+		# 3.2.2: an http URL always names a host authority, not empty
+		if (!("host" in split_url))
+			return 0
+		trace("host present >" split_url["host"] "<")
+		if (split_url["host"] == "")
+			return 0
+
+		# 2616 3.2.3 empty path is /    sole fixup: scheme://hostport
+		if (split_url["path"] == "")
+			split_url["path"] = "/"
+	}
+
+	trace("path is now >" split_url["path"] "<")
+	trace("first path char is >" substr(split_url["path"], 1, 1) "<")
+	# The path must be absolute.
+	return split_url["path"] ~ /^\//
+}
+
+# Return 1 if URI may be sent in a Location header, else 0.
+function location_header_ok(URI) {
+	# policy: all response URLs shall be https
+	if (index(URI, "https://") != 1)
+		return 0
+
+	# The URL shall have been encoded; bad_uric() reports any
+	# character that should not appear in a URI as is
+	if (bad_uric(URI))
+		return 0
+	return 1
+}
+
+# Return 1 for a response status that starts with 3 or with 201, else 0.
+function response_needs_location(response) {
+	return response ~ /^(3|201)/
+}
+
+# Send the HTTP reply and exit.  response is the "NNN reason" status,
+# content the message body, URI an optional Location header target.
+function respond_and_exit(response, content, URI) {
+	# A malformed status is a bug in this script: report it as a 500.
+	if (response !~ /^[1-5][0-9][0-9] /) {
+		trace( "DEBUG: response '" response "'\n" )
+		trace( "DEBUG: content: '" content"'\n" )
+		response = "500 Internal Server Error"
+		content = response CRLF
+	}
+
+	# If the URI is given validate it should be sent and prepare header
+	if (location_header_ok(URI) && response_needs_location(response))
+		location = CRLF "Location: " URI
+	else
+		location = ""
+
+	content_length = sprintf("Content-Length: %d", length(content))
+
+	# RFC 2616 9.4 HEAD MUST NOT return message body.
+	if (method == "HEAD")
+		content = ""
+
+	# Final trace before changing line endings visual separation
+	trace("")
+
+	# Status line with the prepared location, then the fixed headers.
+
+	# Separate header lines with CRLF but add nothing after the body
+	OFS = CRLF
+	ORS = ""
+
+	print( "HTTP/1.1 " response location,
+		content_length,
+		"Content-Type: text/plain; charset=UTF-8",
+		"X-Frame-Options: DENY",
+		"Pragma: no-cache",
+		"Cache-Control: no-Store,no-Cache",
+		"X-XSS-Protection: 1; mode=block",
+		"X-Content-Type-Options: nosniff",
+		"Connection: close",
+		"",
+		content)
+
+	# We told client to close the connection; also close this end.
+	exit 0
+}
+
+# Reply with the status for error num and exit; the connection is closed
+# to avoid synchronization with the rest of the request.
+function respond_error(num) {
+	if (!(num in errors))
+		respond_and_exit(errors[500], "unknown error number " num CRLF)
+
+	# the body is the specific message if one exists, else the status
+	if (num in msgtxt)
+		respond_and_exit(errors[num], msgtxt[num] CRLF)
+	respond_and_exit(errors[num], errors[num] CRLF)
+}
+
+# Write string as a line to tracefile and/or pipe it to tracecmd.
+# To generate a trace, set either variable with awk -v
+function trace(string) {
+	if (tracefile)
+		print string > tracefile
+	if (tracecmd)
+		print string | tracecmd
+}
+
+
+
+###########################################################
+
+# BEGIN {
+# # The character classes as defined in rfc 2396
+# reserved = ";/?:@&=+$,"
+# mark = "-_.!~*'()"
+# digit = "0123456789"
+# lower = "abcdefghijklmnopqrstuvwxyz"
+# upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+# unreserved = lower upper digit mark
+#
+# control = 00-1F, 7F
+# space = " "
+# delims = "<>#%" dquote
+# unwise = "{}|\^[]`"
+# }
+
+################################################################
+
+# Fill hex2int[] so that each hex digit character, in either case, maps
+# to its integer value 0-15.
+function make_hex2int(hex2int) {
+	# i is both the value and the offset of its digit in each string
+	for (i = 0; i < 16; i++) {
+		hex2int[substr("0123456789ABCDEF", i + 1, 1)] = i
+		hex2int[substr("0123456789abcdef", i + 1, 1)] = i
+	}
+}
diff --git a/http-redirect/http-redirect.socket b/http-redirect/http-redirect.socket
new file mode 100644
index 0000000..f0f3302
--- /dev/null
+++ b/http-redirect/http-redirect.socket
@@ -0,0 +1,9 @@
+[Unit]
+Description=Redirect HTTP to HTTPS socket
+
+[Socket]
+ListenStream=80
+Accept=yes
+
+[Install]
+WantedBy=sockets.target
diff --git a/http-redirect/http-redirect@.service b/http-redirect/http-redirect@.service
new file mode 100644
index 0000000..19fa21b
--- /dev/null
+++ b/http-redirect/http-redirect@.service
@@ -0,0 +1,6 @@
+[Unit]
+Description=Redirect HTTP to HTTPS Per-connection
+
+[Service]
+ExecStart=/usr/bin/awk -f /usr/bin/http-redirect.awk
+StandardInput=socket
diff --git a/http-redirect/tests b/http-redirect/tests
new file mode 100755
index 0000000..a11031f
--- /dev/null
+++ b/http-redirect/tests
@@ -0,0 +1,104 @@
+#!/bin/sh -e
+# Unit tests for http-redirect.awk; run from the directory that holds it.
+set -e
+
+# Scratch files, named by process id and removed when the script exits
+TMPFILE="out.$$"
+URLFILE="urls.$$"
+# TRACEFILE="/dev/tty"
+
+rm -f "$TMPFILE" "$URLFILE" && touch "$TMPFILE" &&
+	trap "rm ./$TMPFILE ./$URLFILE" 0 || exit 2
+echo "/over/the/rainbow /over/the/rainbow" > "$URLFILE"
+
+# build up the command line
+# for mawk, add -Wi or -W interactive to avoid buffered read on fifo
+overrides="-v urlfile=$URLFILE -f add-urls.awk -v tracefile=$TRACEFILE"
+command="awk -f ./http-redirect.awk $overrides"
+
+
+# remember the CR in your expect
+
+# The host of an absolute URI is used; the Host: header is ignored.
+test="1 test absolute URI"
+expect='^location: https://somewhere.example.com/over/the/rainbow.$'
+$command << HERE > $TMPFILE
+GET http://somewhere.example.com/over/the/rainbow HTTP/1.1
+Host: elsewhere.example
+
+HERE
+if ! grep -is "$expect" $TMPFILE
+then
+ echo FAIL $test
+ echo "Expected to find >'$expect'< in :"
+ cat $TMPFILE
+ false
+else
+ echo PASS $test
+fi
+
+# An absolute URI with no path is redirected to / on that host.
+test="2 Test no absolute-path in URI"
+expect='^location: https://somewhere.example/.$'
+$command << HERE > $TMPFILE
+GET http://somewhere.example HTTP/1.1
+Host: elsewhere.example
+
+HERE
+if ! grep -is "$expect" $TMPFILE
+then
+ echo FAIL $test
+ echo "Expected to find >'$expect'< in :"
+ cat $TMPFILE
+ false
+else
+ echo PASS $test
+fi
+
+
+# A request with only a path takes the redirect host from Host:
+test="3 test generic 1.1 client"
+expect='^location: https://elsewhere.example/over/the/rainbow.$'
+$command << HERE > $TMPFILE
+GET /over/the/rainbow HTTP/1.1
+Host: elsewhere.example
+
+HERE
+if ! grep -is "$expect" $TMPFILE
+then
+ echo FAIL $test
+ echo "Expected to find >'$expect'< in :"
+ cat $TMPFILE
+ false
+else
+ echo PASS $test
+fi
+
+
+
+# A lower case host: header is found among similarly named headers and
+# a continuation line that belongs to another header.
+test="4 test generic 1.1 client"
+expect='^location: https://somewhere.com/over/the/rainbow.$'
+$command << HERE > $TMPFILE
+GET /over/the/rainbow HTTP/1.1
+not-host: elsewhere.example
+x-host: elsewhere.example.com
+ ( comment )
+host: somewhere.com
+host2: else.where.example.com
+
+HERE
+if ! grep -is "$expect" $TMPFILE
+then
+ echo FAIL $test
+ echo "Expected to find >'$expect'< in :"
+ cat $TMPFILE
+ false
+else
+ echo PASS $test
+fi
+
+
+
+echo all tests passed