upnp_xml.c revision 252726
1217309Snwhitehorn/*
2251843Sbapt * UPnP XML helper routines
3217309Snwhitehorn * Copyright (c) 2000-2003 Intel Corporation
4220749Snwhitehorn * Copyright (c) 2006-2007 Sony Corporation
5217309Snwhitehorn * Copyright (c) 2008-2009 Atheros Communications
6251843Sbapt * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
7217309Snwhitehorn *
8217309Snwhitehorn * See wps_upnp.c for more details on licensing and code history.
9217309Snwhitehorn */
10217309Snwhitehorn
11217309Snwhitehorn#include "includes.h"
12217309Snwhitehorn
13217309Snwhitehorn#include "common.h"
14217309Snwhitehorn#include "base64.h"
15217309Snwhitehorn#include "http.h"
16217309Snwhitehorn#include "upnp_xml.h"
17217309Snwhitehorn
18217309Snwhitehorn
19217309Snwhitehorn/*
20217309Snwhitehorn * XML parsing and formatting
21217309Snwhitehorn *
22217309Snwhitehorn * XML is a markup language based on unicode; usually (and in our case,
23217309Snwhitehorn * always!) based on utf-8. utf-8 uses a variable number of bytes per
24217309Snwhitehorn * character. utf-8 has the advantage that all non-ASCII unicode characters are
25217309Snwhitehorn * represented by sequences of non-ascii (high bit set) bytes, whereas ASCII
26217309Snwhitehorn * characters are single ascii bytes, thus we can use typical text processing.
27217309Snwhitehorn *
28217309Snwhitehorn * (One other interesting thing about utf-8 is that it is possible to look at
29217309Snwhitehorn * any random byte and determine if it is the first byte of a character as
30217309Snwhitehorn * versus a continuation byte).
31251843Sbapt *
 * The base syntax of XML uses a few ASCII punctuation characters; any such
 * characters that would appear in the payload data are rewritten using
 * escape sequences, e.g., &amp; for ampersand (&) and &lt; for left angle
 * bracket (<). Five such escapes total (more can be defined but that does not
 * apply to our case). Thus we can safely parse for angle brackets etc.
37217309Snwhitehorn *
38217309Snwhitehorn * XML describes tree structures of tagged data, with each element beginning
39217309Snwhitehorn * with an opening tag <label> and ending with a closing tag </label> with
40217309Snwhitehorn * matching label. (There is also a self-closing tag <label/> which is supposed
41217309Snwhitehorn * to be equivalent to <label></label>, i.e., no payload, but we are unlikely
42217309Snwhitehorn * to see it for our purpose).
43217309Snwhitehorn *
44217309Snwhitehorn * Actually the opening tags are a little more complicated because they can
45217309Snwhitehorn * contain "attributes" after the label (delimited by ascii space or tab chars)
46217309Snwhitehorn * of the form attribute_label="value" or attribute_label='value'; as it turns
47217309Snwhitehorn * out we do not have to read any of these attributes, just ignore them.
48217309Snwhitehorn *
49217309Snwhitehorn * Labels are any sequence of chars other than space, tab, right angle bracket
50217309Snwhitehorn * (and ?), but may have an inner structure of <namespace><colon><plain_label>.
51217309Snwhitehorn * As it turns out, we can ignore the namespaces, in fact we can ignore the
52217309Snwhitehorn * entire tree hierarchy, because the plain labels we are looking for will be
53217309Snwhitehorn * unique (not in general, but for this application). We do however have to be
54217309Snwhitehorn * careful to skip over the namespaces.
55217309Snwhitehorn *
56217309Snwhitehorn * In generating XML we have to be more careful, but that is easy because
57217309Snwhitehorn * everything we do is pretty canned. The only real care to take is to escape
58217309Snwhitehorn * any special chars in our payload.
59217309Snwhitehorn */
60217309Snwhitehorn
/**
 * xml_next_tag - Advance to next tag
 * @in: Input (NUL-terminated text to scan)
 * @out: OUT: start of tag just after '<'
 * @out_tagname: OUT: start of name of tag, skipping namespace
 * @end: OUT: one after tag (just past the closing '>')
 * Returns: 0 on success, 1 on failure
 *
 * A tag has form:
 *     <left angle bracket><...><right angle bracket>
 * Within the angle brackets, there is an optional leading forward slash (which
 * makes the tag an ending tag), then an optional leading label (followed by
 * colon) and then the tag name itself.
 *
 * Note that angle brackets present in the original data must have been encoded
 * as &lt; and &gt; so they will not trouble us.
 *
 * Failure means that either no '<' was found, or a '<' was found without a
 * matching '>'. In the first case none of the OUT parameters are written; in
 * the second case @out and @out_tagname have already been written, but @end
 * has not.
 */
int xml_next_tag(const char *in, const char **out,
		 const char **out_tagname, const char **end)
{
	/* Skip payload text up to the next tag opener */
	while (*in && *in != '<')
		in++;
	if (*in != '<')
		return 1;
	*out = ++in;

	/* A leading slash marks a closing tag; it is not part of the name */
	if (*in == '/')
		in++;
	*out_tagname = in; /* maybe; corrected below if a namespace is found */

	/*
	 * Scan over a possible namespace prefix. The input is UTF-8, so bytes
	 * with the high bit set may appear here; <ctype.h> functions require
	 * a value representable as unsigned char, hence the cast.
	 */
	while (isalnum((unsigned char) *in) || *in == '-')
		in++;
	if (*in == ':')
		*out_tagname = ++in;

	/* Skip the rest of the tag (name remainder and any attributes) */
	while (*in && *in != '>')
		in++;
	if (*in != '>')
		return 1;
	*end = ++in;
	return 0;
}
100251843Sbapt
101217309Snwhitehorn
102251843Sbapt/* xml_data_encode -- format data for xml file, escaping special characters.
103251843Sbapt *
104251843Sbapt * Note that we assume we are using utf8 both as input and as output!
105251843Sbapt * In utf8, characters may be classed as follows:
106251843Sbapt *     0xxxxxxx(2) -- 1 byte ascii char
107251843Sbapt *     11xxxxxx(2) -- 1st byte of multi-byte char w/ unicode value >= 0x80
108251843Sbapt *         110xxxxx(2) -- 1st byte of 2 byte sequence (5 payload bits here)
109251843Sbapt *         1110xxxx(2) -- 1st byte of 3 byte sequence (4 payload bits here)
110251843Sbapt *         11110xxx(2) -- 1st byte of 4 byte sequence (3 payload bits here)
111251843Sbapt *      10xxxxxx(2) -- extension byte (6 payload bits per byte)
112251843Sbapt *      Some values implied by the above are however illegal because they
113251843Sbapt *      do not represent unicode chars or are not the shortest encoding.
114251843Sbapt * Actually, we can almost entirely ignore the above and just do
115251843Sbapt * text processing same as for ascii text.
116251843Sbapt *
117251843Sbapt * XML is written with arbitrary unicode characters, except that five
118251843Sbapt * characters have special meaning and so must be escaped where they
119251843Sbapt * appear in payload data... which we do here.
120251843Sbapt */
121251843Sbaptvoid xml_data_encode(struct wpabuf *buf, const char *data, int len)
122251843Sbapt{
123251843Sbapt	int i;
124251843Sbapt	for (i = 0; i < len; i++) {
125251843Sbapt		u8 c = ((u8 *) data)[i];
126251843Sbapt		if (c == '<') {
127251843Sbapt			wpabuf_put_str(buf, "&lt;");
128251843Sbapt			continue;
129251843Sbapt		}
130251843Sbapt		if (c == '>') {
131251843Sbapt			wpabuf_put_str(buf, "&gt;");
132251843Sbapt			continue;
133251843Sbapt		}
134251843Sbapt		if (c == '&') {
135251843Sbapt			wpabuf_put_str(buf, "&amp;");
136251843Sbapt			continue;
137251843Sbapt		}
138251843Sbapt		if (c == '\'') {
139251843Sbapt			wpabuf_put_str(buf, "&apos;");
140251843Sbapt			continue;
141251843Sbapt		}
142251843Sbapt		if (c == '"') {
143251843Sbapt			wpabuf_put_str(buf, "&quot;");
144251843Sbapt			continue;
145251843Sbapt		}
146251843Sbapt		/*
147251843Sbapt		 * We could try to represent control characters using the
148251843Sbapt		 * sequence: &#x; where x is replaced by a hex numeral, but not
149251843Sbapt		 * clear why we would do this.
150251843Sbapt		 */
151251843Sbapt		wpabuf_put_u8(buf, c);
152251843Sbapt	}
153251843Sbapt}
154251843Sbapt
155251843Sbapt
/* xml_add_tagged_data -- append "<tag>data</tag>\n" to buf.
 *
 * tag is written verbatim into both the opening and the closing tag, so it
 * must not contain any characters that are special in XML.
 * data is a NUL-terminated string; it is passed through xml_data_encode(), so
 * any special characters in it are escaped.
 */
void xml_add_tagged_data(struct wpabuf *buf, const char *tag, const char *data)
{
	int data_len;

	wpabuf_printf(buf, "<%s>", tag);

	data_len = os_strlen(data);
	xml_data_encode(buf, data, data_len);

	wpabuf_printf(buf, "</%s>\n", tag);
}
167251843Sbapt
168217309Snwhitehorn
/* A POST body looks something like (per upnp spec):
 * <?xml version="1.0"?>
 * <s:Envelope
 *     xmlns:s="http://schemas.xmlsoap.org/soap/envelope/"
 *     s:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/">
 *   <s:Body>
 *     <u:actionName xmlns:u="urn:schemas-upnp-org:service:serviceType:v">
 *       <argumentName>in arg value</argumentName>
 *       other in args and their values go here, if any
 *     </u:actionName>
 *   </s:Body>
 * </s:Envelope>
 *
 * where :
 *      s: might be some other namespace name followed by colon
 *      u: might be some other namespace name followed by colon
 *      actionName will be replaced according to action requested
 *      schema following actionName will be WFA scheme instead
 *      argumentName will be actual argument name
 *      (in arg value) will be actual argument value
 */

/**
 * xml_get_first_item - Get the text following the first tag with a given name
 * @doc: NUL-terminated XML document to search
 * @item: Tag name to look for (without namespace prefix)
 * Returns: Newly allocated, NUL-terminated copy of the text between the
 * matching opening tag and the next '<' (or the end of @doc), or %NULL if no
 * matching opening tag is found or the allocation fails
 *
 * The tag name comparison is case-insensitive and ignores any namespace
 * prefix. Closing tags are never matched. The text is copied as is; no
 * entity unescaping is done here. The returned buffer comes from os_zalloc()
 * and is to be released by the caller with os_free().
 */
char * xml_get_first_item(const char *doc, const char *item)
{
	size_t item_len = os_strlen(item);
	const char *tag, *tagname, *end;
	size_t value_len;
	char *value;

	/*
	 * This is crude: ignore any possible tag name conflicts and go right
	 * to the first tag of this name. This should be ok for the limited
	 * domain of UPnP messages.
	 */
	for (;;) {
		unsigned char delim;

		if (xml_next_tag(doc, &tag, &tagname, &end))
			return NULL;
		doc = end;

		if (*tag == '/')
			continue; /* closing tag */
		if (os_strncasecmp(tagname, item, item_len) != 0)
			continue; /* different name */

		/*
		 * The name must end right here, i.e., be followed by the end
		 * of the tag or by whitespace before attributes; otherwise
		 * item is only a prefix of a longer tag name. Use unsigned
		 * char since <ctype.h> functions must not be given negative
		 * values (possible with non-ASCII utf-8 bytes).
		 */
		delim = (unsigned char) tagname[item_len];
		if (delim == '>' || !isgraph(delim))
			break;
	}

	/* The value runs from just after the tag to the next tag (or EOS) */
	end = doc;
	while (*end && *end != '<')
		end++;
	value_len = end - doc;

	/* os_zalloc() zero-fills, which provides the NUL terminator */
	value = os_zalloc(value_len + 1);
	if (value == NULL)
		return NULL;
	os_memcpy(value, doc, value_len);
	return value;
}
222217309Snwhitehorn
223217309Snwhitehorn
224217309Snwhitehornstruct wpabuf * xml_get_base64_item(const char *data, const char *name,
225217309Snwhitehorn				    enum http_reply_code *ret)
226217309Snwhitehorn{
227217309Snwhitehorn	char *msg;
228217309Snwhitehorn	struct wpabuf *buf;
229217309Snwhitehorn	unsigned char *decoded;
230217309Snwhitehorn	size_t len;
231217309Snwhitehorn
232217309Snwhitehorn	msg = xml_get_first_item(data, name);
233217309Snwhitehorn	if (msg == NULL) {
234217309Snwhitehorn		*ret = UPNP_ARG_VALUE_INVALID;
235217309Snwhitehorn		return NULL;
236217309Snwhitehorn	}
237217309Snwhitehorn
238217309Snwhitehorn	decoded = base64_decode((unsigned char *) msg, os_strlen(msg), &len);
239217309Snwhitehorn	os_free(msg);
240217309Snwhitehorn	if (decoded == NULL) {
241217309Snwhitehorn		*ret = UPNP_OUT_OF_MEMORY;
242217309Snwhitehorn		return NULL;
243217309Snwhitehorn	}
244217309Snwhitehorn
245217309Snwhitehorn	buf = wpabuf_alloc_ext_data(decoded, len);
246217309Snwhitehorn	if (buf == NULL) {
247217309Snwhitehorn		os_free(decoded);
248217309Snwhitehorn		*ret = UPNP_OUT_OF_MEMORY;
249217309Snwhitehorn		return NULL;
250217309Snwhitehorn	}
251217309Snwhitehorn	return buf;
252217309Snwhitehorn}
253217309Snwhitehorn