From 26166cf5e0aeedd05fa2777bfe7730cfc2c917c8 Mon Sep 17 00:00:00 2001
From: Michael Brown
Date: Sun, 14 Jan 2007 23:48:31 +0000
Subject: [PATCH] Added a URI parser that should be standards conformant.
 (It can certainly handle something as convoluted as
 ftp://joe:secret@insecure.org:8081/hidden/path/to?what=is#this

---
 src/core/uri.c         | 177 +++++++++++++++++++++++++++++++++++++++++
 src/include/gpxe/uri.h | 116 +++++++++++++++++++++++++++
 2 files changed, 293 insertions(+)
 create mode 100644 src/core/uri.c
 create mode 100644 src/include/gpxe/uri.h

diff --git a/src/core/uri.c b/src/core/uri.c
new file mode 100644
index 00000000..42367170
--- /dev/null
+++ b/src/core/uri.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2007 Michael Brown .
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/** @file
+ *
+ * Uniform Resource Identifiers
+ *
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <gpxe/uri.h>
+
+/**
+ * Parse URI
+ *
+ * @v uri_string	URI as a string
+ * @ret uri		URI, or NULL if memory allocation fails
+ *
+ * Splits a URI into its component parts.  The returned URI structure
+ * is dynamically allocated and must eventually be freed by calling
+ * free_uri().
+ *
+ * The component strings point into a private copy of uri_string held
+ * in the same allocation as the URI structure, so uri_string itself
+ * is never modified and free_uri() releases everything at once.
+ *
+ * Note that the first ':' found outside the fragment is always
+ * treated as the scheme delimiter, so a relative URI containing a
+ * ':' (e.g. within its query) is not parsed correctly.
+ */
+struct uri * parse_uri ( const char *uri_string ) {
+	struct uri *uri;
+	char *raw;
+	char *tmp;
+	char *path = NULL;
+	char *authority = NULL;
+	size_t raw_len;
+
+	/* Allocate space for URI struct and a copy of the string */
+	raw_len = ( strlen ( uri_string ) + 1 /* NUL */ );
+	uri = malloc ( sizeof ( *uri ) + raw_len );
+	if ( ! uri )
+		return NULL;
+	raw = ( ( ( char * ) uri ) + sizeof ( *uri ) );
+
+	/* Zero URI struct and copy in the raw string */
+	memset ( uri, 0, sizeof ( *uri ) );
+	memcpy ( raw, uri_string, raw_len );
+
+	/* Start by chopping off the fragment, if it exists */
+	if ( ( tmp = strchr ( raw, '#' ) ) ) {
+		*(tmp++) = '\0';
+		uri->fragment = tmp;
+	}
+
+	/* Identify absolute/relative URI */
+	if ( ( tmp = strchr ( raw, ':' ) ) ) {
+		/* Absolute URI: identify hierarchical/opaque */
+		uri->scheme = raw;
+		*(tmp++) = '\0';
+		if ( *tmp == '/' ) {
+			/* Absolute URI with hierarchical part */
+			path = tmp;
+		} else {
+			/* Absolute URI with opaque part */
+			uri->opaque = tmp;
+		}
+	} else {
+		/* Relative URI */
+		path = raw;
+	}
+
+	/* If we don't have a path (i.e. we have an absolute URI with
+	 * an opaque portion), we're already finished processing
+	 */
+	if ( ! path )
+		goto done;
+
+	/* Chop off the query, if it exists */
+	if ( ( tmp = strchr ( path, '?' ) ) ) {
+		*(tmp++) = '\0';
+		uri->query = tmp;
+	}
+
+	/* Identify net/absolute/relative path */
+	if ( strncmp ( path, "//", 2 ) == 0 ) {
+		/* Net path.  If this is terminated by the first '/'
+		 * of an absolute path, then we have no space for a
+		 * terminator after the authority field, so shuffle
+		 * the authority down by one byte, overwriting one of
+		 * the two slashes.
+		 */
+		authority = ( path + 2 );
+		if ( ( tmp = strchr ( authority, '/' ) ) ) {
+			/* Shuffle down */
+			uri->path = tmp;
+			memmove ( ( authority - 1 ), authority,
+				  ( tmp - authority ) );
+			authority--;
+			*(--tmp) = '\0';
+		}
+	} else {
+		/* Absolute/relative path */
+		uri->path = path;
+	}
+
+	/* A URI without a net path has no authority component, so
+	 * there is nothing further to split.  (Passing the NULL
+	 * authority pointer to strchr() would be undefined
+	 * behaviour.)
+	 */
+	if ( ! authority )
+		goto done;
+
+	/* Split authority into user[:password] and host[:port] portions */
+	if ( ( tmp = strchr ( authority, '@' ) ) ) {
+		/* Has user[:password] */
+		*(tmp++) = '\0';
+		uri->host = tmp;
+		uri->user = authority;
+		if ( ( tmp = strchr ( authority, ':' ) ) ) {
+			/* Has password */
+			*(tmp++) = '\0';
+			uri->password = tmp;
+		}
+	} else {
+		/* No user:password */
+		uri->host = authority;
+	}
+
+	/* Split host into host[:port] */
+	if ( ( tmp = strchr ( uri->host, ':' ) ) ) {
+		*(tmp++) = '\0';
+		uri->port = tmp;
+	}
+
+ done:
+	DBG ( "URI \"%s\" split into", raw );
+	if ( uri->scheme )
+		DBG ( " scheme \"%s\"", uri->scheme );
+	if ( uri->opaque )
+		DBG ( " opaque \"%s\"", uri->opaque );
+	if ( uri->user )
+		DBG ( " user \"%s\"", uri->user );
+	if ( uri->password )
+		DBG ( " password \"%s\"", uri->password );
+	if ( uri->host )
+		DBG ( " host \"%s\"", uri->host );
+	if ( uri->port )
+		DBG ( " port \"%s\"", uri->port );
+	if ( uri->path )
+		DBG ( " path \"%s\"", uri->path );
+	if ( uri->query )
+		DBG ( " query \"%s\"", uri->query );
+	if ( uri->fragment )
+		DBG ( " fragment \"%s\"", uri->fragment );
+	DBG ( "\n" );
+
+	return uri;
+}
diff --git a/src/include/gpxe/uri.h b/src/include/gpxe/uri.h
new file mode 100644
index 00000000..24f92c67
--- /dev/null
+++ b/src/include/gpxe/uri.h
@@ -0,0 +1,116 @@
+#ifndef _GPXE_URI_H
+#define _GPXE_URI_H
+
+/** @file
+ *
+ * Uniform Resource Identifiers
+ *
+ */
+
+#include <stdlib.h>
+
+/** A Uniform Resource Identifier
+ *
+ * Terminology for this data structure is as per uri(7), except that
+ * "path" is defined to include the leading '/' for an absolute path.
+ *
+ * Note that all fields within a URI are optional and may be NULL.
+ *
+ * Some examples are probably helpful:
+ *
+ *   http://www.etherboot.org/wiki :
+ *
+ *   scheme = "http", host = "www.etherboot.org", path = "/wiki"
+ *
+ *   /var/lib/tftpboot :
+ *
+ *   path = "/var/lib/tftpboot"
+ *
+ *   mailto:bob@nowhere.com :
+ *
+ *   scheme = "mailto", opaque = "bob@nowhere.com"
+ *
+ *   ftp://joe:secret@insecure.org:8081/hidden/path/to?what=is#this
+ *
+ *   scheme = "ftp", user = "joe", password = "secret",
+ *   host = "insecure.org", port = "8081", path = "/hidden/path/to",
+ *   query = "what=is", fragment = "this"
+ */
+struct uri {
+	/** Scheme */
+	const char *scheme;
+	/** Opaque part */
+	const char *opaque;
+	/** User name */
+	const char *user;
+	/** Password */
+	const char *password;
+	/** Host name */
+	const char *host;
+	/** Port number */
+	const char *port;
+	/** Path */
+	const char *path;
+	/** Query */
+	const char *query;
+	/** Fragment */
+	const char *fragment;
+};
+
+/**
+ * URI is an absolute URI
+ *
+ * @v uri			URI
+ * @ret is_absolute		URI is absolute
+ *
+ * An absolute URI begins with a scheme, e.g. "http:" or "mailto:".
+ * Note that this is a separate concept from a URI with an absolute
+ * path.
+ */
+static inline int uri_is_absolute ( struct uri *uri ) {
+	return ( uri->scheme != NULL );
+}
+
+/**
+ * URI has an absolute path
+ *
+ * @v uri			URI
+ * @ret has_absolute_path	URI has an absolute path
+ *
+ * An absolute path begins with a '/'.  Note that this is a separate
+ * concept from an absolute URI.  Note also that a URI may not have a
+ * path at all.
+ */
+static inline int uri_has_absolute_path ( struct uri *uri ) {
+	return ( uri->path && ( uri->path[0] == '/' ) );
+}
+
+/**
+ * URI has a relative path
+ *
+ * @v uri			URI
+ * @ret has_relative_path	URI has a relative path
+ *
+ * A relative path begins with something other than a '/'.  Note that
+ * this is a separate concept from a relative URI.  Note also that a
+ * URI may not have a path at all.
+ */
+static inline int uri_has_relative_path ( struct uri *uri ) {
+	return ( uri->path && ( uri->path[0] != '/' ) );
+}
+
+/**
+ * Free URI structure
+ *
+ * @v uri			URI
+ *
+ * Frees all the dynamically-allocated storage used by the URI
+ * structure.
+ */
+static inline void free_uri ( struct uri *uri ) {
+	free ( uri );
+}
+
+extern struct uri * parse_uri ( const char *uri_string );
+
+#endif /* _GPXE_URI_H */