1#! /usr/bin/env python
2
3from __future__ import print_function
4
5#__all__ = ['EncDec', 'EncDecSimple', 'EncDecTyped', 'EncDecA',
6#    'SequenceError', 'Sequencer']
7
8import abc
9import struct
10import sys
11
# Table of field formats -> precompiled little-endian unsigned codecs.
# Each fixed-width format is reachable both by its integer byte width
# (1, 2, 4, 8) and by the equivalent string key ('1', '2', '4', '8');
# both keys share one struct.Struct instance.  '_string_' has no
# fixed-size codec and is handled specially by EncDecSimple.
_ProtoStruct = {'_string_': None}
for _width, _code in ((1, 'B'), (2, 'H'), (4, 'I'), (8, 'Q')):
    _ProtoStruct[_width] = _ProtoStruct[str(_width)] = \
        struct.Struct('<' + _code)
del _width, _code
22
class EncDec(object):
    """
    Base class for en/de-coders, which are put into sequencers.

    All have a name and arbitrary user-supplied auxiliary data
    (the subclasses in this file default it to None).

    All provide a pack() and unpack().  The pack() function
    returns a "bytes" value.  This is internally implemented as a
    function apack() that returns a list of struct.pack() bytes,
    and pack() just joins them up as needed.

    The pack/unpack functions take a dictionary of variable names
    and values, and a second dictionary for conditionals, but at
    this level conditionals don't apply: they are just being
    passed through.  Variable names do apply to array encoders.

    EncDec also provides b2s() and s2b() static methods, which
    convert strings to bytes and vice versa, as reversibly as
    possible (using surrogateescape encoding).  In Python 2, b2s()
    is a no-op since the string type *is* the bytes type (<type
    'unicode'> is the unicode-ized string type, which s2b()
    encodes as UTF-8).

    EncDec also provides b2u() and u2b() to do conversion to/from
    Unicode.

    These are partly for internal use (all strings get converted
    to UTF-8 byte sequences when coding a _string_ type) and partly
    for doctests, where we just want some py2k/py3k compat hacks.
    """
    # The docstring must be the first statement in the class body to
    # become EncDec.__doc__, so the metaclass assignment follows it.
    # NOTE: only Python 2 honors __metaclass__; under Python 3 this is
    # an ordinary class attribute and the abstract methods below are
    # not enforced at instantiation time.
    __metaclass__ = abc.ABCMeta

    def __init__(self, name, aux):
        """
        Remember the field <name> and the caller's auxiliary data <aux>.
        """
        self.name = name
        self.aux = aux

    @staticmethod
    def b2u(byte_sequence):
        "transform bytes to unicode"
        return byte_sequence.decode('utf-8', 'surrogateescape')

    @staticmethod
    def u2b(unicode_sequence):
        "transform unicode to bytes"
        return unicode_sequence.encode('utf-8', 'surrogateescape')

    # The str type differs between py2k and py3k, so choose the
    # string<->bytes helpers once, at class-creation time.
    if sys.version_info[0] >= 3:
        # py3k: str is unicode, so bytes-to-string is bytes-to-unicode.
        b2s = b2u
        @staticmethod
        def s2b(string):
            "transform string to bytes (leaves raw byte sequence unchanged)"
            if isinstance(string, bytes):
                return string
            return string.encode('utf-8', 'surrogateescape')
    else:
        @staticmethod
        def b2s(byte_sequence):
            "transform bytes to string - no-op in python2.7"
            return byte_sequence
        @staticmethod
        def s2b(string):
            "transform string or unicode to bytes"
            if isinstance(string, unicode):
                return string.encode('utf-8', 'surrogateescape')
            return string

    def pack(self, vdict, cdict, val):
        """
        Encode value <val> into a byte-string.

        This joins the list of byte-strings produced by
        self.apack(vdict, cdict, val) into a single bytes value.
        """
        return b''.join(self.apack(vdict, cdict, val))

    @abc.abstractmethod
    def apack(self, vdict, cdict, val):
        "encode value <val> into [bytes1, b2, ..., bN]"

    @abc.abstractmethod
    def unpack(self, vdict, cdict, bstring, offset, noerror=False):
        "unpack bytes from <bstring> at <offset>"
99
100
class EncDecSimple(EncDec):
    r"""
    Encode/decode a simple (but named) field.  The field is not an
    array (that requires EncDecA), nor a typed object like a qid
    or stat instance (those require a Sequencer and EncDecTyped).

    The format is one of '1'/1, '2'/2, '4'/4, '8'/8, or '_string_'.

    Note: using b2s here is purely a doctest/testmod python2/python3
    compat hack.  The output of e.pack is <type 'bytes'>; b2s
    converts it to a string, purely for display purposes.  (It might
    be better to map py2 output to bytes but they just print as a
    string anyway.)  In normal use, you should not call b2s here.

    >>> e = EncDecSimple('eggs', 2)
    >>> e.b2s(e.pack({}, {}, 0))
    '\x00\x00'
    >>> e.b2s(e.pack({}, {}, 256))
    '\x00\x01'

    Values that cannot be packed produce a SequenceError:

    >>> e.pack({}, {}, None)
    Traceback (most recent call last):
        ...
    SequenceError: failed while packing 'eggs'=None
    >>> e.pack({}, {}, -1)
    Traceback (most recent call last):
        ...
    SequenceError: failed while packing 'eggs'=-1

    Unpacking both returns a value, and tells how many bytes it
    used out of the bytestring or byte-array argument.  If there
    are not enough bytes remaining at the starting offset, it
    raises a SequenceError, unless noerror=True (then unset
    values are None)

    >>> e.unpack({}, {}, b'\x00\x01', 0)
    (256, 2)
    >>> e.unpack({}, {}, b'', 0)
    Traceback (most recent call last):
        ...
    SequenceError: out of data while unpacking 'eggs'
    >>> e.unpack({}, {}, b'', 0, noerror=True)
    (None, 2)

    Note that strings can be provided as regular strings, byte
    strings (same as regular strings in py2k), or Unicode strings
    (same as regular strings in py3k).  Unicode strings will be
    converted to UTF-8 before being packed.  Since this leaves
    7-bit characters alone, these examples work in both py2k and
    py3k.  (Note: the UTF-8 encoding of u'\u1234' is
    '\xe1\x88\xb4' or 225, 136, 180. The b2i trick below is
    another py2k vs py3k special case just for doctests: py2k
    tries to display the utf-8 encoded data as a string.)

    >>> e = EncDecSimple('spam', '_string_')
    >>> e.b2s(e.pack({}, {}, 'p3=unicode,p2=bytes'))
    '\x13\x00p3=unicode,p2=bytes'

    >>> e.b2s(e.pack({}, {}, b'bytes'))
    '\x05\x00bytes'

    >>> import sys
    >>> ispy3k = sys.version_info[0] >= 3

    >>> b2i = lambda x: x if ispy3k else ord(x)
    >>> [b2i(x) for x in e.pack({}, {}, u'\u1234')]
    [3, 0, 225, 136, 180]

    The byte length of the utf-8 data cannot exceed 65535 since
    the encoding has the length as a 2-byte field (a la the
    encoding for 'eggs' here).  A too-long string produces
    a SequenceError as well.

    >>> e.pack({}, {}, 16384 * 'spam')
    Traceback (most recent call last):
        ...
    SequenceError: string too long (len=65536) while packing 'spam'

    Unpacking strings produces byte arrays.  (Of course,
    in py2k these are also known as <type 'str'>.)

    >>> unpacked = e.unpack({}, {}, b'\x04\x00data', 0)
    >>> etype = bytes if ispy3k else str
    >>> print(isinstance(unpacked[0], etype))
    True
    >>> e.b2s(unpacked[0])
    'data'
    >>> unpacked[1]
    6

    You may use e.b2s() to convert them to unicode strings in py3k,
    or you may set e.autob2s.  This still only really does
    anything in py3k, since py2k strings *are* bytes, so it's
    really just intended for doctest purposes (see EncDecA):

    >>> e.autob2s = True
    >>> e.unpack({}, {}, b'\x07\x00stringy', 0)
    ('stringy', 9)
    """
    # string length: 2 byte unsigned field
    string_len = _ProtoStruct[2]

    def __init__(self, name, fmt, aux=None):
        """
        Set up a coder for field <name> using format <fmt>, which
        must be a key of _ProtoStruct.
        """
        super(EncDecSimple, self).__init__(name, aux)
        self.fmt = fmt
        # None for '_string_', otherwise the fixed-width struct codec.
        self.struct = _ProtoStruct[fmt]
        self.autob2s = False

    def __repr__(self):
        # aux is shown only when one was actually supplied.
        shown = [self.name, self.fmt]
        if self.aux is not None:
            shown.append(self.aux)
        return '{0}({1})'.format(self.__class__.__name__,
            ', '.join(repr(item) for item in shown))

    __str__ = __repr__

    def apack(self, vdict, cdict, val):
        "encode a value; returns a list of byte-strings"
        try:
            if not self.struct:
                # '_string_': a 2-byte length, then the bytes themselves.
                raw = self.s2b(val)
                nbytes = len(raw)
                if nbytes > 65535:
                    raise SequenceError('string too long (len={0:d}) '
                        'while packing {1!r}'.format(nbytes, self.name))
                return [EncDecSimple.string_len.pack(nbytes), raw]
            return [self.struct.pack(val)]
        # AttributeError is included in case someone tries to, e.g.,
        # pack name=None and self.s2b() tries to use .encode on it.
        except (struct.error, AttributeError):
            raise SequenceError('failed '
                'while packing {0!r}={1!r}'.format(self.name, val))

    def _unpack1(self, via, bstring, offset, noerror):
        """
        Internal function to unpack a single item with struct <via>.

        Returns (value, new offset).  On a too-short buffer the
        value is None if <noerror>, else SequenceError is raised.
        """
        try:
            fields = via.unpack_from(bstring, offset)
        except struct.error as err:
            if 'unpack_from requires a buffer of at least' not in str(err):
                # not clear what to do here if noerror
                raise SequenceError('failed '
                    'while unpacking {0!r}'.format(self.name))
            if not noerror:
                raise SequenceError('out of data '
                    'while unpacking {0!r}'.format(self.name))
            return None, offset + via.size
        assert len(fields) == 1
        return fields[0], offset + via.size

    def unpack(self, vdict, cdict, bstring, offset, noerror=False):
        "decode a value; return the value and the new offset"
        if self.struct:
            return self._unpack1(self.struct, bstring, offset, noerror)
        # '_string_': decode the 2-byte length, then slice out the data.
        slen, offset = self._unpack1(EncDecSimple.string_len, bstring,
            offset, noerror)
        if slen is None:
            return None, offset
        nexto = offset + slen
        if len(bstring) >= nexto:
            val = bstring[offset:nexto]
            if self.autob2s:
                val = self.b2s(val)
            return val, nexto
        if not noerror:
            raise SequenceError('out of data '
                'while unpacking {0!r}'.format(self.name))
        return None, nexto
273
class EncDecTyped(EncDec):
    r"""
    EncDec for typed objects (which are built from PFODs, which are
    a sneaky class variant of OrderedDict similar to namedtuple).

    Calling the klass() function with no arguments must create an
    instance with all-None members.

    We also require a Sequencer to pack and unpack the members of
    the underlying pfod.

    >>> qid_s = Sequencer('qid')
    >>> qid_s.append_encdec(None, EncDecSimple('type', 1))
    >>> qid_s.append_encdec(None, EncDecSimple('version', 4))
    >>> qid_s.append_encdec(None, EncDecSimple('path', 8))
    >>> len(qid_s)
    3

    >>> from pfod import pfod
    >>> qid = pfod('qid', ['type', 'version', 'path'])
    >>> len(qid._fields)
    3
    >>> qid_inst = qid(1, 2, 3)
    >>> qid_inst
    qid(type=1, version=2, path=3)

    >>> e = EncDecTyped(qid, 'aqid', qid_s)
    >>> e.b2s(e.pack({}, {}, qid_inst))
    '\x01\x02\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00'
    >>> e.unpack({}, {},
    ... b'\x01\x02\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00', 0)
    (qid(type=1, version=2, path=3), 13)

    If an EncDecTyped instance has a conditional sequencer, note
    that unpacking will leave un-selected items set to None (see
    the Sequencer example below):

    >>> breakfast = pfod('breakfast', 'eggs spam ham')
    >>> breakfast()
    breakfast(eggs=None, spam=None, ham=None)
    >>> bfseq = Sequencer('breakfast')
    >>> bfseq.append_encdec(None, EncDecSimple('eggs', 1))
    >>> bfseq.append_encdec('yuck', EncDecSimple('spam', 1))
    >>> bfseq.append_encdec(None, EncDecSimple('ham', 1))
    >>> e = EncDecTyped(breakfast, 'bfname', bfseq)
    >>> e.unpack({}, {'yuck': False}, b'\x02\x01\x04', 0)
    (breakfast(eggs=2, spam=None, ham=1), 2)

    This used just two of the three bytes: eggs=2, ham=1.

    >>> e.unpack({}, {'yuck': True}, b'\x02\x01\x04', 0)
    (breakfast(eggs=2, spam=1, ham=4), 3)

    This used the third byte, so ham=4.
    """
    def __init__(self, klass, name, sequence, aux=None):
        """
        <klass> builds the object, <sequence> packs/unpacks its
        members; there must be one coder per field of klass().
        """
        assert len(sequence) == len(klass()._fields) # temporary
        # The base class records both name and aux.
        super(EncDecTyped, self).__init__(name, aux)
        self.klass = klass
        self.sequence = sequence

    def __repr__(self):
        # aux is shown only when one was actually supplied.
        shown = [self.klass, self.name, self.sequence]
        if self.aux is not None:
            shown.append(self.aux)
        return '{0}({1})'.format(self.__class__.__name__,
            ', '.join(repr(item) for item in shown))

    __str__ = __repr__

    def apack(self, vdict, cdict, val):
        """
        Pack the members of <val> via our sequence; <val> itself
        serves as the variable dictionary for that sequence.

        Some members may be packed only conditionally, per <cdict>.
        """
        return self.sequence.apack(val, cdict)

    def unpack(self, vdict, cdict, bstring, offset, noerror=False):
        """
        Create a fresh self.klass() object and let our sequence
        fill in its members from <bstring>, starting at <offset>.
        Return the new object and the new offset.

        Some members may be unpacked only conditionally, per <cdict>.
        """
        instance = self.klass()
        end = self.sequence.unpack_from(instance, cdict, bstring, offset,
            noerror)
        return instance, end
363
class EncDecA(EncDec):
    r"""
    EncDec for arrays (repeated objects).

    We take the name of the repeat count variable, and a sub-coder
    (an EncDec instance).  For instance, we can en/de-code
    repeat='nwname' copies of name='wname', or nwname of
    name='wqid', in a Twalk en/de-code.

    Note that we don't pack or unpack the repeat count itself --
    that must be done by higher level code.  We just get its value
    from vdict.

    >>> subcode = EncDecSimple('wname', '_string_')
    >>> e = EncDecA('nwname', 'wname', subcode)
    >>> e.b2s(e.pack({'nwname': 2}, {}, ['A', 'BC']))
    '\x01\x00A\x02\x00BC'

    >>> subcode.autob2s = True # so that A and BC decode to py3k str
    >>> e.unpack({'nwname': 2}, {}, b'\x01\x00A\x02\x00BC', 0)
    (['A', 'BC'], 7)

    When using noerror, the first sub-item that fails to decode
    completely starts the None-s.  Strings whose length fails to
    decode are assumed to be zero bytes long as well, for the
    purpose of showing the expected packet length:

    >>> e.unpack({'nwname': 2}, {}, b'\x01\x00A\x02\x00', 0, noerror=True)
    (['A', None], 7)
    >>> e.unpack({'nwname': 2}, {}, b'\x01\x00A\x02', 0, noerror=True)
    (['A', None], 5)
    >>> e.unpack({'nwname': 3}, {}, b'\x01\x00A\x02', 0, noerror=True)
    (['A', None, None], 7)

    As a special case, supplying None for the sub-coder
    makes the repeated item pack or unpack a simple byte
    string.  (Note that autob2s is not supported here.)
    A too-short byte string is simply truncated!

    >>> e = EncDecA('count', 'data', None)
    >>> e.b2s(e.pack({'count': 5}, {}, b'12345'))
    '12345'
    >>> x = list(e.unpack({'count': 3}, {}, b'123', 0))
    >>> x[0] = e.b2s(x[0])
    >>> x
    ['123', 3]
    >>> x = list(e.unpack({'count': 3}, {}, b'12', 0, noerror=True))
    >>> x[0] = e.b2s(x[0])
    >>> x
    ['12', 3]
    """
    def __init__(self, repeat, name, sub, aux=None):
        """
        <repeat> names the count variable looked up in vdict;
        <sub> codes one element (None means raw bytes).
        """
        # The base class records both name and aux.
        super(EncDecA, self).__init__(name, aux)
        self.repeat = repeat
        self.sub = sub

    def __repr__(self):
        # aux is shown only when one was actually supplied.
        shown = [self.repeat, self.name, self.sub]
        if self.aux is not None:
            shown.append(self.aux)
        return '{0}({1})'.format(self.__class__.__name__,
            ', '.join(repr(item) for item in shown))

    __str__ = __repr__

    def apack(self, vdict, cdict, val):
        "pack each val[i], for i in range(vdict[self.repeat])"
        count = vdict[self.repeat]
        assert count == len(val)
        if self.sub is not None:
            pieces = []
            for item in val:
                pieces.extend(self.sub.apack(vdict, cdict, item))
            return pieces
        # No sub-coder: the value is already the raw byte string.
        assert isinstance(val, bytes)
        return [val]

    def unpack(self, vdict, cdict, bstring, offset, noerror=False):
        "unpack repeatedly, per self.repeat, into new array."
        count = vdict[self.repeat]
        if count is None and noerror:
            # The count itself was not decoded; unpack no elements.
            count = 0
        else:
            assert count >= 0
        if self.sub is None:
            # No sub-coder: take <count> raw bytes (the slice is
            # simply shorter when noerror hides a short buffer).
            end = offset + count
            if len(bstring) < end and not noerror:
                raise SequenceError('out of data '
                    'while unpacking {0!r}'.format(self.name))
            return bstring[offset:end], end
        items = []
        for _ in range(count):
            item, offset = self.sub.unpack(vdict, cdict, bstring, offset,
                noerror)
            items.append(item)
        return items, offset
461
class SequenceError(Exception):
    """
    Raised by the en/de-coders and by Sequencer when a value cannot
    be packed (e.g., item too big) or the byte string cannot be
    unpacked (e.g., ran out of data, or bytes left unconsumed).
    """
465
class Sequencer(object):
    r"""
    A sequencer is an object that packs (marshals) or unpacks
    (unmarshals) a series of objects, according to their EncDec
    instances.

    The objects themselves (and their values) come from, or
    go into, a dictionary: <vdict>, the first argument to
    pack/unpack.

    Some fields may be conditional.  The conditions are in a
    separate dictionary (the second or <cdict> argument).

    Some objects may be dictionaries or PFODs, e.g., they may
    be a Plan9 qid or stat structure.  These have their own
    sub-encoding.

    As with each encoder, we have both an apack() function
    (returns a list of parts) and a plain pack().  Users should
    mostly stick with plain pack().

    >>> s = Sequencer('monty')
    >>> s
    Sequencer('monty')
    >>> e = EncDecSimple('eggs', 2)
    >>> s.append_encdec(None, e)
    >>> s.append_encdec(None, EncDecSimple('spam', 1))
    >>> s[0]
    (None, EncDecSimple('eggs', 2))
    >>> e.b2s(s.pack({'eggs': 513, 'spam': 65}, {}))
    '\x01\x02A'

    When particular fields are conditional, they appear in
    packed output, or are taken from the byte-string during
    unpacking, only if their condition is true.

    As with struct, use unpack_from to start at an arbitrary
    offset and/or omit verification that the entire byte-string
    is consumed.

    >>> s = Sequencer('python')
    >>> s.append_encdec(None, e)
    >>> s.append_encdec('.u', EncDecSimple('spam', 1))
    >>> s[1]
    ('.u', EncDecSimple('spam', 1))
    >>> e.b2s(s.pack({'eggs': 513, 'spam': 65}, {'.u': True}))
    '\x01\x02A'
    >>> e.b2s(s.pack({'eggs': 513, 'spam': 65}, {'.u': False}))
    '\x01\x02'

    >>> d = {}
    >>> s.unpack(d, {'.u': True}, b'\x01\x02A')
    >>> print(d['eggs'], d['spam'])
    513 65
    >>> d = {}
    >>> s.unpack(d, {'.u': False}, b'\x01\x02A', 0)
    Traceback (most recent call last):
        ...
    SequenceError: 1 byte(s) unconsumed
    >>> s.unpack_from(d, {'.u': False}, b'\x01\x02A', 0)
    2
    >>> print(d)
    {'eggs': 513}

    The incoming dictionary-like object may be pre-initialized
    if you like; only sequences that decode are filled-in:

    >>> d = {'eggs': None, 'spam': None}
    >>> s.unpack_from(d, {'.u': False}, b'\x01\x02A', 0)
    2
    >>> print(d['eggs'], d['spam'])
    513 None

    Some objects may be arrays; if so their EncDec is actually
    an EncDecA, the repeat count must be in the dictionary, and
    the object itself must have a len() and be index-able:

    >>> s = Sequencer('arr')
    >>> s.append_encdec(None, EncDecSimple('n', 1))
    >>> ae = EncDecSimple('array', 2)
    >>> s.append_encdec(None, EncDecA('n', 'array', ae))
    >>> ae.b2s(s.pack({'n': 2, 'array': [257, 514]}, {}))
    '\x02\x01\x01\x02\x02'

    Unpacking an array creates a list of the number of items.
    The EncDec encoder that decodes the number of items needs to
    occur first in the sequencer, so that the dictionary will have
    acquired the repeat-count variable's value by the time we hit
    the array's encdec:

    >>> d = {}
    >>> s.unpack(d, {}, b'\x01\x04\x00')
    >>> d['n'], d['array']
    (1, [4])
    """
    def __init__(self, name):
        """
        Create an empty sequencer called <name>.  Debug tracing
        starts out disabled; see dprint().
        """
        self.name = name
        # Ordered list of (cond, code) pairs; see append_encdec().
        self._codes = []
        self.debug = False # or sys.stderr

    def __repr__(self):
        return '{0}({1!r})'.format(self.__class__.__name__, self.name)

    __str__ = __repr__

    def __len__(self):
        "number of (cond, code) pairs appended so far"
        return len(self._codes)

    def __iter__(self):
        "iterate over the (cond, code) pairs, in order"
        return iter(self._codes)

    def __getitem__(self, index):
        "return the (cond, code) pair at <index>"
        return self._codes[index]

    def dprint(self, *args, **kwargs):
        """
        Print a debug message, if debugging is enabled.

        Nothing is printed while self.debug is false.  If
        self.debug is a bool (i.e., True), output goes to
        sys.stdout; otherwise self.debug itself is used as
        the destination file.
        """
        if not self.debug:
            return
        if isinstance(self.debug, bool):
            dest = sys.stdout
        else:
            dest = self.debug
        print(*args, file=dest, **kwargs)

    def append_encdec(self, cond, code):
        """
        Add EncDec en/de-coder <code>, conditional on <cond>.

        <cond> is either None (the field is always present) or a
        key that is looked up in <cdict> when packing/unpacking.
        """
        self._codes.append((cond, code))

    def apack(self, vdict, cdict):
        """
        Produce packed representation of each field.

        Returns a list of byte-strings: the concatenated apack()
        results of every coder whose condition holds, each given
        the value vdict[code.name].
        """
        packed_data = []
        for cond, code in self._codes:
            # Skip this item if it's conditional on a false thing.
            # The debug message is only formatted when debugging is
            # on, so repr(code) is not computed on every field of
            # every packet just to be thrown away by dprint().
            if cond is not None and not cdict[cond]:
                if self.debug:
                    self.dprint('skip %r - %r is False' % (code, cond))
                continue

            # Pack the item.
            if self.debug:
                self.dprint('pack %r - no cond or %r is True' % (code, cond))
            packed_data.extend(code.apack(vdict, cdict, vdict[code.name]))

        return packed_data

    def pack(self, vdict, cdict):
        """
        Flatten packed data: join the parts from apack() into a
        single byte-string.
        """
        return b''.join(self.apack(vdict, cdict))

    def unpack_from(self, vdict, cdict, bstring, offset=0, noerror=False):
        """
        Unpack from byte string, starting at <offset>; return the
        offset just past the last item unpacked.

        The values are unpacked into a dictionary vdict;
        some of its entries may themselves be ordered
        dictionaries created by typedefed codes.

        Raises SequenceError if the string is too short,
        unless you set noerror, in which case we assume
        you want to see what you can get out of the data.
        """
        for cond, code in self._codes:
            # Skip this item if it's conditional on a false thing.
            # (Debug messages are formatted only when debugging is on.)
            if cond is not None and not cdict[cond]:
                if self.debug:
                    self.dprint('skip %r - %r is False' % (code, cond))
                continue

            # Unpack the item.
            if self.debug:
                self.dprint('unpack %r - no cond or %r is True' % (code, cond))
            obj, offset = code.unpack(vdict, cdict, bstring, offset, noerror)
            # Store right away: a later coder (e.g., an array) may
            # need this value from vdict.
            vdict[code.name] = obj

        return offset

    def unpack(self, vdict, cdict, bstring, noerror=False):
        """
        Like unpack_from but unless noerror=True, requires that
        we completely use up the given byte string.

        Returns None; raises SequenceError if bytes are left over
        (or if the offset otherwise differs from len(bstring)).
        """
        offset = self.unpack_from(vdict, cdict, bstring, 0, noerror)
        if not noerror and offset != len(bstring):
            raise SequenceError('{0} byte(s) unconsumed'.format(
                len(bstring) - offset))
650
if __name__ == '__main__':
    # Run as a script: execute the doctests embedded in this module.
    from doctest import testmod
    testmod()
654