From da0a582564da3a0afeaa17336a025876a925db0e Mon Sep 17 00:00:00 2001 From: Steve Newman Date: Tue, 2 Jun 2009 12:48:18 -0700 Subject: [PATCH] Fixes to URL functionality: - Extend http.URLUnescape to convert '+' to space - Add http.URLEscape - Rename URL.Query to EncodedQuery (and stop decoding it, as decoding this field before separating key/value pairs loses important information) - Report a clean error on incomplete hex escapes - Update existing tests, add new ones APPROVED=rsc DELTA=293 (256 added, 3 deleted, 34 changed) OCL=29685 CL=29759 --- src/lib/http/url.go | 117 +++++++++++++++++++---- src/lib/http/url_test.go | 200 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 285 insertions(+), 32 deletions(-) diff --git a/src/lib/http/url.go b/src/lib/http/url.go index db51d99aa2..0325b04eed 100644 --- a/src/lib/http/url.go +++ b/src/lib/http/url.go @@ -41,37 +41,60 @@ func unhex(c byte) byte { return 0 } +// Return true if the specified character should be escaped when appearing in a +// URL string. +// +// TODO: for now, this is a hack; it only flags a few common characters that have +// special meaning in URLs. That will get the job done in the common cases. +func shouldEscape(c byte) bool { + switch c { + case ' ', '?', '&', '=', '#', '+', '%': + return true; + } + return false; +} + // URLUnescape unescapes a URL-encoded string, -// converting %AB into the byte 0xAB. +// converting %AB into the byte 0xAB and '+' into ' ' (space). // It returns a BadURL error if any % is not followed // by two hexadecimal digits. func URLUnescape(s string) (string, os.Error) { // Count %, check that they're well-formed. n := 0; + anyPlusses := false; for i := 0; i < len(s); { - if s[i] == '%' { + switch s[i] { + case '%': n++; - if !ishex(s[i+1]) || !ishex(s[i+2]) { + if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) { return "", BadURL{"invalid hexadecimal escape"} } - i += 3 - } else { + i += 3; + case '+': + anyPlusses = true; + i++; + default: i++ } } - if n == 0 { + if n == 0 && !anyPlusses { return s, nil } t := make([]byte, len(s)-2*n); j := 0; for i := 0; i < len(s); { - if s[i] == '%' { + switch s[i] { + case '%': t[j] = unhex(s[i+1]) << 4 | unhex(s[i+2]); j++; i += 3; - } else { + case '+': + t[j] = ' '; + j++; + i++; + default: t[j] = s[i]; j++; i++; @@ -80,9 +103,53 @@ func URLUnescape(s string) (string, os.Error) { return string(t), nil; } +// URLEscape converts a string into URL-encoded form. +func URLEscape(s string) string { + spaceCount, hexCount := 0, 0; + for i := 0; i < len(s); i++ { + c := s[i]; + if (shouldEscape(c)) { + if (c == ' ') { + spaceCount++; + } else { + hexCount++; + } + } + } + + if spaceCount == 0 && hexCount == 0 { + return s; + } + + t := make([]byte, len(s)+2*hexCount); + j := 0; + for i := 0; i < len(s); i++ { + c := s[i]; + if !shouldEscape(c) { + t[j] = s[i]; + j++; + } else if (c == ' ') { + t[j] = '+'; + j++; + } else { + t[j] = '%'; + t[j+1] = "0123456789abcdef"[c>>4]; + t[j+2] = "0123456789abcdef"[c&15]; + j += 3; + } + } + return string(t); +} + // A URL represents a parsed URL (technically, a URI reference). // The general form represented is: // scheme://[userinfo@]host/path[?query][#fragment] +// The Raw, RawPath, and RawQuery fields are in "wire format" (special +// characters must be hex-escaped if not meant to have special meaning). +// All other fields are logical values; '+' or '%' represent themselves. +// +// Note, the reason for using wire format for the query is that it needs +// to be split into key/value pairs before decoding. type URL struct { Raw string; // the original string Scheme string; // scheme @@ -91,7 +158,7 @@ type URL struct { Userinfo string; // userinfo Host string; // host Path string; // /path - Query string; // query + RawQuery string; // query Fragment string; // fragment } @@ -156,10 +223,7 @@ func ParseURL(rawurl string) (url *URL, err os.Error) { // RFC 2396: a relative URI (no scheme) has a ?query, // but absolute URIs only have query if path begins with / if url.Scheme == "" || len(path) > 0 && path[0] == '/' { - path, url.Query = split(path, '?', true); - if url.Query, err = URLUnescape(url.Query); err != nil { - return nil, err - } + path, url.RawQuery = split(path, '?', true); } // Maybe path is //authority/path @@ -180,6 +244,21 @@ func ParseURL(rawurl string) (url *URL, err os.Error) { return nil, err } + // Remove escapes from the Authority and Userinfo fields, and verify + // that Scheme and Host contain no escapes (that would be illegal). + if url.Authority, err = URLUnescape(url.Authority); err != nil { + return nil, err + } + if url.Userinfo, err = URLUnescape(url.Userinfo); err != nil { + return nil, err + } + if (strings.Index(url.Scheme, "%") >= 0) { + return nil, BadURL{"hexadecimal escape in scheme"} + } + if (strings.Index(url.Host, "%") >= 0) { + return nil, BadURL{"hexadecimal escape in host"} + } + return url, nil } @@ -200,7 +279,7 @@ func ParseURLReference(rawurlref string) (url *URL, err os.Error) { // // There are redundant fields stored in the URL structure: // the String method consults Scheme, Path, Host, Userinfo, -// Query, and Fragment, but not RawPath or Authority. +// RawQuery, and Fragment, but not Raw, RawPath or Authority. func (url *URL) String() string { result := ""; if url.Scheme != "" { @@ -209,16 +288,16 @@ func (url *URL) String() string { if url.Host != "" || url.Userinfo != "" { result += "//"; if url.Userinfo != "" { - result += url.Userinfo + "@"; + result += URLEscape(url.Userinfo) + "@"; } result += url.Host; } - result += url.Path; - if url.Query != "" { - result += "?" + url.Query; + result += URLEscape(url.Path); + if url.RawQuery != "" { + result += "?" + url.RawQuery; } if url.Fragment != "" { - result += "#" + url.Fragment; + result += "#" + URLEscape(url.Fragment); } return result; } diff --git a/src/lib/http/url_test.go b/src/lib/http/url_test.go index f5a7069aea..8d8fabad5f 100644 --- a/src/lib/http/url_test.go +++ b/src/lib/http/url_test.go @@ -20,6 +20,7 @@ import ( type URLTest struct { in string; out *URL; + roundtrip string; // expected result of reserializing the URL; empty means same as "in". } var urltests = []URLTest { @@ -31,7 +32,8 @@ var urltests = []URLTest { "http", "//www.google.com", "www.google.com", "", "www.google.com", "", "", "" - } + }, + "" }, // path URLTest{ @@ -41,7 +43,19 @@ var urltests = []URLTest { "http", "//www.google.com/", "www.google.com", "", "www.google.com", "/", "", "" - } + }, + "" + }, + // path with hex escaping... note that space roundtrips to + + URLTest{ + "http://www.google.com/file%20one%26two", + &URL{ + "http://www.google.com/file%20one%26two", + "http", "//www.google.com/file%20one%26two", + "www.google.com", "", "www.google.com", + "/file one&two", "", "" + }, + "http://www.google.com/file+one%26two" }, // user URLTest{ @@ -51,7 +65,19 @@ var urltests = []URLTest { "ftp", "//webmaster@www.google.com/", "webmaster@www.google.com", "webmaster", "www.google.com", "/", "", "" - } + }, + "" + }, + // escape sequence in username + URLTest{ + "ftp://john%20doe@www.google.com/", + &URL{ + "ftp://john%20doe@www.google.com/", + "ftp", "//john%20doe@www.google.com/", + "john doe@www.google.com", "john doe", "www.google.com", + "/", "", "" + }, + "ftp://john+doe@www.google.com/" }, // query URLTest{ @@ -61,7 +87,19 @@ var urltests = []URLTest { "http", "//www.google.com/?q=go+language", "www.google.com", "", "www.google.com", "/", "q=go+language", "" - } + }, + "" + }, + // query with hex escaping: NOT parsed + URLTest{ + "http://www.google.com/?q=go%20language", + &URL{ + "http://www.google.com/?q=go%20language", + "http", "//www.google.com/?q=go%20language", + "www.google.com", "", "www.google.com", + "/", "q=go%20language", "" + }, + "" }, // path without /, so no query parsing URLTest{ @@ -70,8 +108,9 @@ var urltests = []URLTest { "http:www.google.com/?q=go+language", "http", "www.google.com/?q=go+language", "", "", "", - "www.google.com/?q=go+language", "", "" - } + "www.google.com/?q=go language", "", "" + }, + "http:www.google.com/%3fq%3dgo+language" }, // non-authority URLTest{ @@ -81,7 +120,8 @@ var urltests = []URLTest { "mailto", "/webmaster@golang.org", "", "", "", "/webmaster@golang.org", "", "" - } + }, + "" }, // non-authority URLTest{ @@ -91,7 +131,8 @@ var urltests = []URLTest { "mailto", "webmaster@golang.org", "", "", "", "webmaster@golang.org", "", "" - } + }, + "" }, } @@ -103,7 +144,8 @@ var urlnofragtests = []URLTest { "http", "//www.google.com/?q=go+language#foo", "www.google.com", "", "www.google.com", "/", "q=go+language#foo", "" - } + }, + "" }, } @@ -115,7 +157,18 @@ var urlfragtests = []URLTest { "http", "//www.google.com/?q=go+language", "www.google.com", "", "www.google.com", "/", "q=go+language", "foo" - } + }, + "" + }, + URLTest{ + "http://www.google.com/?q=go+language#foo%26bar", + &URL{ + "http://www.google.com/?q=go+language", + "http", "//www.google.com/?q=go+language", + "www.google.com", "", "www.google.com", + "/", "q=go+language", "foo&bar" + }, + "" }, } @@ -123,7 +176,7 @@ var urlfragtests = []URLTest { func ufmt(u *URL) string { return fmt.Sprintf("%q, %q, %q, %q, %q, %q, %q, %q, %q", u.Raw, u.Scheme, u.RawPath, u.Authority, u.Userinfo, - u.Host, u.Path, u.Query, u.Fragment); + u.Host, u.Path, u.RawQuery, u.Fragment); } func DoTest(t *testing.T, parse func(string) (*URL, os.Error), name string, tests []URLTest) { @@ -158,8 +211,12 @@ func DoTestString(t *testing.T, parse func(string) (*URL, os.Error), name string continue; } s := u.String(); - if s != tt.in { - t.Errorf("%s(%q).String() == %q", tt.in, s); + expected := tt.in; + if len(tt.roundtrip) > 0 { + expected = tt.roundtrip; + } + if s != expected { + t.Errorf("%s(%q).String() == %q (expected %q)", name, tt.in, s, expected); } } } @@ -172,3 +229,120 @@ func TestURLString(t *testing.T) { DoTestString(t, ParseURLReference, "ParseURLReference", urlfragtests); DoTestString(t, ParseURLReference, "ParseURLReference", urlnofragtests); } + +type URLEscapeTest struct { + in string; + out string; + err os.Error; +} + +var unescapeTests = []URLEscapeTest { + URLEscapeTest{ + "", + "", + nil + }, + URLEscapeTest{ + "abc", + "abc", + nil + }, + URLEscapeTest{ + "1%41", + "1A", + nil + }, + URLEscapeTest{ + "1%41%42%43", + "1ABC", + nil + }, + URLEscapeTest{ + "%4a", + "J", + nil + }, + URLEscapeTest{ + "%6F", + "o", + nil + }, + URLEscapeTest{ + "%", // not enough characters after % + "", + BadURL{"invalid hexadecimal escape"} + }, + URLEscapeTest{ + "%a", // not enough characters after % + "", + BadURL{"invalid hexadecimal escape"} + }, + URLEscapeTest{ + "%1", // not enough characters after % + "", + BadURL{"invalid hexadecimal escape"} + }, + URLEscapeTest{ + "123%45%6", // not enough characters after % + "", + BadURL{"invalid hexadecimal escape"} + }, + URLEscapeTest{ + "%zz", // invalid hex digits + "", + BadURL{"invalid hexadecimal escape"} + }, +} + +func TestURLUnescape(t *testing.T) { + for i, tt := range unescapeTests { + actual, err := URLUnescape(tt.in); + if actual != tt.out || (err != nil) != (tt.err != nil) { + t.Errorf("URLUnescape(%q) = %q, %s; want %q, %s", tt.in, actual, err, tt.out, tt.err); + } + } +} + +var escapeTests = []URLEscapeTest { + URLEscapeTest{ + "", + "", + nil + }, + URLEscapeTest{ + "abc", + "abc", + nil + }, + URLEscapeTest{ + "one two", + "one+two", + nil + }, + URLEscapeTest{ + "10%", + "10%25", + nil + }, + URLEscapeTest{ + " ?&=#+%!", + "+%3f%26%3d%23%2b%25!", + nil + }, +} + +func TestURLEscape(t *testing.T) { + for i, tt := range escapeTests { + actual := URLEscape(tt.in); + if tt.out != actual { + t.Errorf("URLEscape(%q) = %q, want %q", tt.in, actual, tt.out); + } + + // for bonus points, verify that escape:unescape is an identity. + roundtrip, err := URLUnescape(actual); + if roundtrip != tt.in || err != nil { + t.Errorf("URLUnescape(%q) = %q, %s; want %q, %s", actual, roundtrip, err, tt.in, "[no error]"); + } + } +} +