upnp_xml.c revision 252726
1217309Snwhitehorn/* 2251843Sbapt * UPnP XML helper routines 3217309Snwhitehorn * Copyright (c) 2000-2003 Intel Corporation 4220749Snwhitehorn * Copyright (c) 2006-2007 Sony Corporation 5217309Snwhitehorn * Copyright (c) 2008-2009 Atheros Communications 6251843Sbapt * Copyright (c) 2009, Jouni Malinen <j@w1.fi> 7217309Snwhitehorn * 8217309Snwhitehorn * See wps_upnp.c for more details on licensing and code history. 9217309Snwhitehorn */ 10217309Snwhitehorn 11217309Snwhitehorn#include "includes.h" 12217309Snwhitehorn 13217309Snwhitehorn#include "common.h" 14217309Snwhitehorn#include "base64.h" 15217309Snwhitehorn#include "http.h" 16217309Snwhitehorn#include "upnp_xml.h" 17217309Snwhitehorn 18217309Snwhitehorn 19217309Snwhitehorn/* 20217309Snwhitehorn * XML parsing and formatting 21217309Snwhitehorn * 22217309Snwhitehorn * XML is a markup language based on unicode; usually (and in our case, 23217309Snwhitehorn * always!) based on utf-8. utf-8 uses a variable number of bytes per 24217309Snwhitehorn * character. utf-8 has the advantage that all non-ASCII unicode characters are 25217309Snwhitehorn * represented by sequences of non-ascii (high bit set) bytes, whereas ASCII 26217309Snwhitehorn * characters are single ascii bytes, thus we can use typical text processing. 27217309Snwhitehorn * 28217309Snwhitehorn * (One other interesting thing about utf-8 is that it is possible to look at 29217309Snwhitehorn * any random byte and determine if it is the first byte of a character as 30217309Snwhitehorn * versus a continuation byte). 31251843Sbapt * 32217309Snwhitehorn * The base syntax of XML uses a few ASCII punctionation characters; any 33217309Snwhitehorn * characters that would appear in the payload data are rewritten using 34217309Snwhitehorn * sequences, e.g., & for ampersand(&) and < for left angle bracket (<). 35217309Snwhitehorn * Five such escapes total (more can be defined but that does not apply to our 36217309Snwhitehorn * case). Thus we can safely parse for angle brackets etc. 37217309Snwhitehorn * 38217309Snwhitehorn * XML describes tree structures of tagged data, with each element beginning 39217309Snwhitehorn * with an opening tag <label> and ending with a closing tag </label> with 40217309Snwhitehorn * matching label. (There is also a self-closing tag <label/> which is supposed 41217309Snwhitehorn * to be equivalent to <label></label>, i.e., no payload, but we are unlikely 42217309Snwhitehorn * to see it for our purpose). 43217309Snwhitehorn * 44217309Snwhitehorn * Actually the opening tags are a little more complicated because they can 45217309Snwhitehorn * contain "attributes" after the label (delimited by ascii space or tab chars) 46217309Snwhitehorn * of the form attribute_label="value" or attribute_label='value'; as it turns 47217309Snwhitehorn * out we do not have to read any of these attributes, just ignore them. 48217309Snwhitehorn * 49217309Snwhitehorn * Labels are any sequence of chars other than space, tab, right angle bracket 50217309Snwhitehorn * (and ?), but may have an inner structure of <namespace><colon><plain_label>. 51217309Snwhitehorn * As it turns out, we can ignore the namespaces, in fact we can ignore the 52217309Snwhitehorn * entire tree hierarchy, because the plain labels we are looking for will be 53217309Snwhitehorn * unique (not in general, but for this application). We do however have to be 54217309Snwhitehorn * careful to skip over the namespaces. 55217309Snwhitehorn * 56217309Snwhitehorn * In generating XML we have to be more careful, but that is easy because 57217309Snwhitehorn * everything we do is pretty canned. The only real care to take is to escape 58217309Snwhitehorn * any special chars in our payload. 59217309Snwhitehorn */ 60217309Snwhitehorn 61217309Snwhitehorn/** 62217309Snwhitehorn * xml_next_tag - Advance to next tag 63217309Snwhitehorn * @in: Input 64217309Snwhitehorn * @out: OUT: start of tag just after '<' 65217309Snwhitehorn * @out_tagname: OUT: start of name of tag, skipping namespace 66217309Snwhitehorn * @end: OUT: one after tag 67217309Snwhitehorn * Returns: 0 on success, 1 on failure 68217309Snwhitehorn * 69217309Snwhitehorn * A tag has form: 70217309Snwhitehorn * <left angle bracket><...><right angle bracket> 71217309Snwhitehorn * Within the angle brackets, there is an optional leading forward slash (which 72217309Snwhitehorn * makes the tag an ending tag), then an optional leading label (followed by 73217309Snwhitehorn * colon) and then the tag name itself. 74217309Snwhitehorn * 75217309Snwhitehorn * Note that angle brackets present in the original data must have been encoded 76217309Snwhitehorn * as < and > so they will not trouble us. 77217309Snwhitehorn */ 78217309Snwhitehornint xml_next_tag(const char *in, const char **out, 79217309Snwhitehorn const char **out_tagname, const char **end) 80217309Snwhitehorn{ 81217309Snwhitehorn while (*in && *in != '<') 82217309Snwhitehorn in++; 83217309Snwhitehorn if (*in != '<') 84217309Snwhitehorn return 1; 85217309Snwhitehorn *out = ++in; 86217309Snwhitehorn if (*in == '/') 87217309Snwhitehorn in++; 88217309Snwhitehorn *out_tagname = in; /* maybe */ 89251843Sbapt while (isalnum(*in) || *in == '-') 90251843Sbapt in++; 91251843Sbapt if (*in == ':') 92251843Sbapt *out_tagname = ++in; 93251843Sbapt while (*in && *in != '>') 94251843Sbapt in++; 95251843Sbapt if (*in != '>') 96251843Sbapt return 1; 97251843Sbapt *end = ++in; 98251843Sbapt return 0; 99251843Sbapt} 100251843Sbapt 101217309Snwhitehorn 102251843Sbapt/* xml_data_encode -- format data for xml file, escaping special characters. 103251843Sbapt * 104251843Sbapt * Note that we assume we are using utf8 both as input and as output! 105251843Sbapt * In utf8, characters may be classed as follows: 106251843Sbapt * 0xxxxxxx(2) -- 1 byte ascii char 107251843Sbapt * 11xxxxxx(2) -- 1st byte of multi-byte char w/ unicode value >= 0x80 108251843Sbapt * 110xxxxx(2) -- 1st byte of 2 byte sequence (5 payload bits here) 109251843Sbapt * 1110xxxx(2) -- 1st byte of 3 byte sequence (4 payload bits here) 110251843Sbapt * 11110xxx(2) -- 1st byte of 4 byte sequence (3 payload bits here) 111251843Sbapt * 10xxxxxx(2) -- extension byte (6 payload bits per byte) 112251843Sbapt * Some values implied by the above are however illegal because they 113251843Sbapt * do not represent unicode chars or are not the shortest encoding. 114251843Sbapt * Actually, we can almost entirely ignore the above and just do 115251843Sbapt * text processing same as for ascii text. 116251843Sbapt * 117251843Sbapt * XML is written with arbitrary unicode characters, except that five 118251843Sbapt * characters have special meaning and so must be escaped where they 119251843Sbapt * appear in payload data... which we do here. 120251843Sbapt */ 121251843Sbaptvoid xml_data_encode(struct wpabuf *buf, const char *data, int len) 122251843Sbapt{ 123251843Sbapt int i; 124251843Sbapt for (i = 0; i < len; i++) { 125251843Sbapt u8 c = ((u8 *) data)[i]; 126251843Sbapt if (c == '<') { 127251843Sbapt wpabuf_put_str(buf, "<"); 128251843Sbapt continue; 129251843Sbapt } 130251843Sbapt if (c == '>') { 131251843Sbapt wpabuf_put_str(buf, ">"); 132251843Sbapt continue; 133251843Sbapt } 134251843Sbapt if (c == '&') { 135251843Sbapt wpabuf_put_str(buf, "&"); 136251843Sbapt continue; 137251843Sbapt } 138251843Sbapt if (c == '\'') { 139251843Sbapt wpabuf_put_str(buf, "'"); 140251843Sbapt continue; 141251843Sbapt } 142251843Sbapt if (c == '"') { 143251843Sbapt wpabuf_put_str(buf, """); 144251843Sbapt continue; 145251843Sbapt } 146251843Sbapt /* 147251843Sbapt * We could try to represent control characters using the 148251843Sbapt * sequence: &#x; where x is replaced by a hex numeral, but not 149251843Sbapt * clear why we would do this. 150251843Sbapt */ 151251843Sbapt wpabuf_put_u8(buf, c); 152251843Sbapt } 153251843Sbapt} 154251843Sbapt 155251843Sbapt 156251843Sbapt/* xml_add_tagged_data -- format tagged data as a new xml line. 157251843Sbapt * 158251843Sbapt * tag must not have any special chars. 159251843Sbapt * data may have special chars, which are escaped. 160251843Sbapt */ 161251843Sbaptvoid xml_add_tagged_data(struct wpabuf *buf, const char *tag, const char *data) 162251843Sbapt{ 163251843Sbapt wpabuf_printf(buf, "<%s>", tag); 164217309Snwhitehorn xml_data_encode(buf, data, os_strlen(data)); 165217309Snwhitehorn wpabuf_printf(buf, "</%s>\n", tag); 166217309Snwhitehorn} 167251843Sbapt 168217309Snwhitehorn 169217309Snwhitehorn/* A POST body looks something like (per upnp spec): 170217309Snwhitehorn * <?xml version="1.0"?> 171217309Snwhitehorn * <s:Envelope 172217309Snwhitehorn * xmlns:s="http://schemas.xmlsoap.org/soap/envelope/" 173217309Snwhitehorn * s:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"> 174217309Snwhitehorn * <s:Body> 175217309Snwhitehorn * <u:actionName xmlns:u="urn:schemas-upnp-org:service:serviceType:v"> 176217309Snwhitehorn * <argumentName>in arg value</argumentName> 177217309Snwhitehorn * other in args and their values go here, if any 178217309Snwhitehorn * </u:actionName> 179217309Snwhitehorn * </s:Body> 180217309Snwhitehorn * </s:Envelope> 181251843Sbapt * 182251843Sbapt * where : 183251843Sbapt * s: might be some other namespace name followed by colon 184217309Snwhitehorn * u: might be some other namespace name followed by colon 185251843Sbapt * actionName will be replaced according to action requested 186217309Snwhitehorn * schema following actionName will be WFA scheme instead 187251843Sbapt * argumentName will be actual argument name 188217309Snwhitehorn * (in arg value) will be actual argument value 189217309Snwhitehorn */ 190217309Snwhitehornchar * xml_get_first_item(const char *doc, const char *item) 191217309Snwhitehorn{ 192217309Snwhitehorn const char *match = item; 193251843Sbapt int match_len = os_strlen(item); 194217309Snwhitehorn const char *tag, *tagname, *end; 195217309Snwhitehorn char *value; 196217309Snwhitehorn 197251843Sbapt /* 198217309Snwhitehorn * This is crude: ignore any possible tag name conflicts and go right 199217309Snwhitehorn * to the first tag of this name. This should be ok for the limited 200251843Sbapt * domain of UPnP messages. 201251843Sbapt */ 202217309Snwhitehorn for (;;) { 203217309Snwhitehorn if (xml_next_tag(doc, &tag, &tagname, &end)) 204217309Snwhitehorn return NULL; 205217309Snwhitehorn doc = end; 206217309Snwhitehorn if (!os_strncasecmp(tagname, match, match_len) && 207217309Snwhitehorn *tag != '/' && 208217309Snwhitehorn (tagname[match_len] == '>' || 209217309Snwhitehorn !isgraph(tagname[match_len]))) { 210217309Snwhitehorn break; 211217309Snwhitehorn } 212251843Sbapt } 213251843Sbapt end = doc; 214251843Sbapt while (*end && *end != '<') 215217309Snwhitehorn end++; 216217309Snwhitehorn value = os_zalloc(1 + (end - doc)); 217217309Snwhitehorn if (value == NULL) 218217309Snwhitehorn return NULL; 219217309Snwhitehorn os_memcpy(value, doc, end - doc); 220217309Snwhitehorn return value; 221217309Snwhitehorn} 222217309Snwhitehorn 223217309Snwhitehorn 224217309Snwhitehornstruct wpabuf * xml_get_base64_item(const char *data, const char *name, 225217309Snwhitehorn enum http_reply_code *ret) 226217309Snwhitehorn{ 227217309Snwhitehorn char *msg; 228217309Snwhitehorn struct wpabuf *buf; 229217309Snwhitehorn unsigned char *decoded; 230217309Snwhitehorn size_t len; 231217309Snwhitehorn 232217309Snwhitehorn msg = xml_get_first_item(data, name); 233217309Snwhitehorn if (msg == NULL) { 234217309Snwhitehorn *ret = UPNP_ARG_VALUE_INVALID; 235217309Snwhitehorn return NULL; 236217309Snwhitehorn } 237217309Snwhitehorn 238217309Snwhitehorn decoded = base64_decode((unsigned char *) msg, os_strlen(msg), &len); 239217309Snwhitehorn os_free(msg); 240217309Snwhitehorn if (decoded == NULL) { 241217309Snwhitehorn *ret = UPNP_OUT_OF_MEMORY; 242217309Snwhitehorn return NULL; 243217309Snwhitehorn } 244217309Snwhitehorn 245217309Snwhitehorn buf = wpabuf_alloc_ext_data(decoded, len); 246217309Snwhitehorn if (buf == NULL) { 247217309Snwhitehorn os_free(decoded); 248217309Snwhitehorn *ret = UPNP_OUT_OF_MEMORY; 249217309Snwhitehorn return NULL; 250217309Snwhitehorn } 251217309Snwhitehorn return buf; 252217309Snwhitehorn} 253217309Snwhitehorn