Source code for scicomp.exam.kk.lzw

from itertools import repeat, takewhile
from typing import Iterable, Sequence, TypeVar

from more_itertools import always_iterable, collapse, first, stagger


__all__ = 'lzw_encode', 'lzw_decode'


_T = TypeVar('_T')


[docs]def lzw_encode(seq: Sequence[_T], alphabet: Iterable[_T]) -> Iterable[int]: """Encode a sequence using the `Lempel–Ziv–Welch algorithm <https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch>`_. Args: seq: the sequence to encode. alphabet: a collection of possible unique elements in `seq`. If you don't know the alphabet, go back to first grade, or get it via ``set(seq)``. Returns: The LZW encoding of `seq` as an iterable of indices into an ordered "bag" that starts off with just the alphabet and is dynamically built. Warnings: Returns an iterator, so make sure to "consume" it before e.g. saving, or if you plan to re-use the result. See Also: `lzw_decode` """ bag = list(map(tuple, map(always_iterable, alphabet))) return ( # Advance sequence, add element to bag, record index in output (seq := seq[len(s):], bag.append((*s, next(iter(seq), None))), i)[-1] # while loop for _ in takewhile(lambda x: len(seq), repeat(None)) # Find the longest element in the bag that matches at the current # position. Guaranteed to find at least one match if all elements are # in the original alphabet. I don't check the other case. for res in [first(( (bag.index(sub), sub) for j in range(2, len(seq)+1) for sub in [tuple(seq[:j])] if sub not in bag for sub in [sub[:-1]] ), None)] # if this failed, then seq must be in the bag already! for i, s in [res if res is not None else (bag.index(tuple(seq)), seq)] )
[docs]def lzw_decode(idx: Iterable[int], alphabet: Sequence[_T]) -> Iterable[_T]: """Decode a sequence previously encoded using `lzw_encode`. Args: idx: the output of `lzw_encode`, being an iterable of indices into a bag of elements. alphabet: the alphabet used to encode. This has to be the same as the one passed to `lzw_encode`: no more, no fewer elements, in the same order, otherwise gibberish will ensue! Returns: The decoded sequence. It is generally not of the same object type as the original, though, so care should be taken to do item-by-item comparison, especially with strings, which get decoded into an iterator of length-one strings (characters). Warnings: Returns an iterator, so make sure to "consume" it before e.g. saving, or if you plan to re-use the result. See Also: `lzw_encode` """ bag = list(map(tuple, map(always_iterable, alphabet))) return collapse(( # bag elements are tuples, so unpack them bag[i] for i, nexti in stagger(idx, offsets=(0, 1), longest=True) for _ in [ bag.append(( *bag[i], next(iter(bag[nexti] if len(bag) > nexti else bag[i])) )) if nexti is not None else None] ), levels=1)