Merge pull request #91 from Lukasa/headerdict

Lukasa · Lukasa · commit 8cab7ebec906 · 2015-03-07T18:00:38.000Z
Do headers properly.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -19,6 +19,12 @@ Primary HTTP/2 Interface
 .. autoclass:: hyper.HTTP20Push
    :inherited-members:
 
+Headers
+-------
+
+.. autoclass:: hyper.common.headers.HTTPHeaderMap
+   :inherited-members:
+
 Requests Transport Adapter
 --------------------------
 
diff --git a/hyper/common/__init__.py b/hyper/common/__init__.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+"""
+hyper/common
+~~~~~~~~~~~~
+
+Common code in hyper.
+"""
diff --git a/hyper/common/headers.py b/hyper/common/headers.py
@@ -0,0 +1,232 @@
+# -*- coding: utf-8 -*-
+"""
+hyper/common/headers
+~~~~~~~~~~~~~~~~~~~~~
+
+Contains hyper's structures for storing and working with HTTP headers.
+"""
+import collections
+
+from hyper.compat import unicode, bytes, imap
+
+
+class HTTPHeaderMap(collections.MutableMapping):
+    """
+    A structure that contains HTTP headers.
+
+    HTTP headers are a curious beast. At the surface level they look roughly
+    like a name-value set, but in practice they have many variations that
+    make them tricky:
+
+    - duplicate keys are allowed
+    - keys are compared case-insensitively
+    - duplicate keys are isomorphic to comma-separated values, *except when
+      they aren't*!
+    - they logically contain a form of ordering
+
+    This data structure is an attempt to preserve all of that information
+    while being as user-friendly as possible. It retains all of the mapping
+    convenience methods (allowing by-name indexing), while avoiding using a
+    dictionary for storage.
+
+    When iterated over, this structure returns headers in 'canonical form':
+    one ``(name, value)`` tuple per individual header value, where the name
+    is lower-cased and the value keeps its original case. Comma-separated
+    values are split into separate tuples (Set-Cookie headers excepted).
+
+    The mapping always emits both names and values in the form of bytestrings:
+    never unicode strings. It can accept names and values in unicode form, and
+    will automatically encode them to bytestrings using UTF-8. The reason for
+    what appears to be a user-unfriendly decision here is primarily to allow
+    the broadest-possible compatibility (to make it possible to send headers in
+    unusual encodings) while ensuring that users are never confused about what
+    type of data they will receive.
+
+    .. warning:: Note that this data structure makes none of the performance
+                 guarantees of a dictionary. Lookup and deletion are not O(1)
+                 operations. Inserting a new value *is* O(1), all other
+                 operations are O(n), including *replacing* a header entirely.
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Each positional argument is a mapping (anything with ``items()``) or
+        an iterable of ``(name, value)`` pairs; keyword arguments are added
+        last. Names and values may be unicode or bytestrings.
+        """
+        # Headers are kept as an ordered list of (name, value) bytestring
+        # tuples, exactly as provided rather than in 'canonical form', so
+        # the original header structure can always be reproduced.
+        self._items = []
+
+        for arg in args:
+            # Iterating a mapping directly would only yield its keys, so
+            # ask it for its pairs instead.
+            pairs = arg.items() if hasattr(arg, 'items') else arg
+            self._items.extend(_to_bytestring_tuple(*p) for p in pairs)
+
+        for k, v in kwargs.items():
+            self._items.append(_to_bytestring_tuple(k, v))
+
+    def __getitem__(self, key):
+        """
+        Unlike the dict __getitem__, this returns a list of every value
+        stored under ``key`` (compared case-insensitively), in the order
+        they were added. Values are in 'canonical form', meaning that
+        comma-separated values are split into multiple values. Raises
+        KeyError if no header with that name is present.
+        """
+        key = _to_bytestring(key)
+        values = []
+
+        for k, v in self._items:
+            if _keys_equal(k, key):
+                values.extend(x[1] for x in canonical_form(k, v))
+
+        if not values:
+            raise KeyError("Nonexistent header key: {}".format(key))
+
+        return values
+
+    def __setitem__(self, key, value):
+        """
+        Unlike the dict __setitem__, this appends to the list of items.
+        """
+        self._items.append(_to_bytestring_tuple(key, value))
+
+    def __delitem__(self, key):
+        """
+        Deletes every header stored under ``key`` (case-insensitively).
+        Raises KeyError if there is no such header, like a dict would.
+        """
+        key = _to_bytestring(key)
+        kept = [kv for kv in self._items if not _keys_equal(kv[0], key)]
+
+        if len(kept) == len(self._items):
+            raise KeyError("Nonexistent header key: {}".format(key))
+
+        # Slice-assign so the change happens in place on the same list.
+        self._items[:] = kept
+
+    def __iter__(self):
+        """
+        This mapping iterates like the list of tuples it is. The headers are
+        returned in canonical form.
+        """
+        for pair in self._items:
+            for value in canonical_form(*pair):
+                yield value
+
+    def __len__(self):
+        """
+        The length of this mapping is the number of individual headers in
+        canonical form. Sadly, this is a somewhat expensive operation.
+        """
+        return sum(1 for _ in self)
+
+    def __contains__(self, key):
+        """
+        If any header is present with this key, returns True.
+        """
+        key = _to_bytestring(key)
+        return any(_keys_equal(key, k) for k, _ in self._items)
+
+    def keys(self):
+        """
+        Returns an iterable of the header keys in the mapping. This explicitly
+        does not filter duplicates, ensuring that it's the same length as
+        len().
+        """
+        for n, _ in self:
+            yield n
+
+    def items(self):
+        """
+        This mapping iterates like the list of tuples it is.
+        """
+        return self.__iter__()
+
+    def values(self):
+        """
+        This is an almost nonsensical query on a header dictionary, but we
+        satisfy it in the exact same way we satisfy 'keys'.
+        """
+        for _, v in self:
+            yield v
+
+    def get(self, name, default=None):
+        """
+        Unlike the dict get, this returns a list of items in the order they
+        were added, or ``default`` when the header is absent.
+        """
+        try:
+            return self[name]
+        except KeyError:
+            return default
+
+    def iter_raw(self):
+        """
+        Allows iterating over the headers in 'raw' form: that is, the form in
+        which they were added to the structure. This iteration is in order,
+        and can be used to rebuild the original headers (e.g. to determine
+        exactly what a server sent).
+        """
+        for item in self._items:
+            yield item
+
+    def __eq__(self, other):
+        """
+        Header maps are equal when their raw headers match, in order. Other
+        types are never assumed comparable: NotImplemented is returned.
+        """
+        if not isinstance(other, HTTPHeaderMap):
+            return NotImplemented
+        return self._items == other._items
+
+    def __ne__(self, other):
+        # Python 2 does not derive __ne__ from __eq__, so mirror it here.
+        equal = self.__eq__(other)
+        return equal if equal is NotImplemented else not equal
+
+
+def canonical_form(k, v):
+    """
+    Generates the ``(name, value)`` pairs for a header in canonical form:
+    the name is lower-cased and the value is split on commas, with each
+    piece stripped. Set-Cookie and Set-Cookie2 are never split.
+    """
+    name = k.lower()
+
+    if name in (b'set-cookie', b'set-cookie2'):
+        pieces = [v]
+    else:
+        pieces = [part.strip() for part in v.split(b',')]
+
+    for piece in pieces:
+        yield name, piece
+
+
+def _to_bytestring(element):
+    """
+    Returns ``element`` as a bytestring: bytestrings pass through untouched,
+    unicode strings are encoded as UTF-8. Anything else is a ValueError.
+    """
+    if isinstance(element, bytes):
+        return element
+    if isinstance(element, unicode):
+        return element.encode('utf-8')
+    raise ValueError("Non string type.")
+
+
+def _to_bytestring_tuple(*x):
+    """
+    Returns a tuple of the given strings, each converted to a bytestring.
+    """
+    converted = imap(_to_bytestring, x)
+    return tuple(converted)
+
+
+def _keys_equal(x, y):
+    """Returns 'True' when the two header names match, ignoring case."""
+    # HTTP header names are case-insensitive, so compare lower-cased forms.
+    folded_x, folded_y = x.lower(), y.lower()
+    return folded_x == folded_y
diff --git a/hyper/compat.py b/hyper/compat.py
@@ -36,6 +36,7 @@ def ignore_missing():
 
     from urllib import urlencode
     from urlparse import urlparse, urlsplit
+    from itertools import imap
 
     def to_byte(char):
         return ord(char)
@@ -52,9 +53,14 @@ def zlib_compressobj(level=6, method=zlib.DEFLATED, wbits=15, memlevel=8,
                          strategy=zlib.Z_DEFAULT_STRATEGY):
         return zlib.compressobj(level, method, wbits, memlevel, strategy)
 
+    unicode = unicode
+    bytes = str
+
 elif is_py3:
     from urllib.parse import urlencode, urlparse, urlsplit
 
+    imap = map
+
     def to_byte(char):
         return char
 
@@ -71,3 +77,6 @@ def write_to_stdout(data):
         ssl = ssl_compat
     else:
         import ssl
+
+    unicode = str
+    bytes = bytes
diff --git a/test/test_headers.py b/test/test_headers.py