# A Quick Note on an Inverted Index Optimization

import functools
import numpy as np
import math
import time
import matplotlib.pyplot as plt

# Scale factor for posting-list sizes: both samplers draw sizes from an
# exponential distribution whose scale is len(arr) * LAMBDA.
LAMBDA = 0.25
# Number of sets generated per sample, i.e. how many sets each timed fold intersects.
N_POSTINGS_LISTS = 5
# Number of samples timed per (sampler, sort) configuration.
N_SAMPLES = 16 * 1024
# Size of the document-ID universe; the IDs used are range(DOCUMENT_ID_SPACE_SZ).
DOCUMENT_ID_SPACE_SZ = 1024

# Shared NumPy random generator; no seed is given, so results differ between runs.
RNG = np.random.default_rng()

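# The reduce-based version below is the more natural way to write this, but
# without an early return on an empty accumulator it always folds every set,
# which smears the timing differences we want to measure.
# See the `foldl_intersection` definition that follows.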
#
# def foldl_intersection(sets):
#    return functools.reduce(lambda acc, s: acc & s, sets)

def foldl_intersection(sets):
    """Intersect an iterable of sets left to right, stopping once the result is empty.

    Args:
        sets: Iterable of sets. It is consumed at most once, and only as far
            as needed to decide the result.

    Returns:
        The intersection of all the sets. An empty iterable yields an empty
        set. When exactly one set is supplied, that same object is returned
        (no copy is made).

    The input sets are never modified.
    """
    it = iter(sets)
    try:
        acc = next(it)
    except StopIteration:
        return set()
    for s in it:
        # Rebind instead of `acc &= s`: `acc` starts out as the caller's first
        # set, so the in-place operator would silently mutate the caller's data.
        acc = acc & s
        if not acc:
            # An empty intersection can never grow again; skip the remaining sets.
            return set()
    return acc
    
def chain_sampler(arr: list[int], n: int, sort: bool = False) -> list[set[int]]:
    """Build ``n`` sets, each holding a prefix of ``arr`` of random length.

    Prefix lengths are drawn from an exponential distribution with scale
    ``len(arr) * LAMBDA`` and rounded up. Because every set is a prefix of the
    same list, any two of them are nested.

    Args:
        arr: Source list the prefixes are taken from.
        n: Number of sets to produce.
        sort: When true, the drawn lengths are sorted ascending first, so the
            sets come out smallest to largest.

    Returns:
        A list of ``n`` sets, in the order of the (optionally sorted) draws.
    """
    total = len(arr)
    cutoffs = RNG.exponential(total * LAMBDA, n)
    if sort:
        cutoffs.sort()

    # NOTE(review): the cap is len(arr) - 1, so the full list is never
    # returned -- confirm this is intended rather than an off-by-one.
    cap = total - 1
    prefixes = []
    for cutoff in cutoffs:
        size = min(math.ceil(cutoff), cap)
        prefixes.append(set(arr[:size]))
    return prefixes

def unif_sampler(arr: list[int], n: int, sort: bool = False) -> list[set[int]]:
    """Build ``n`` sets, each a uniformly random subset of ``arr`` of random size.

    Subset sizes are drawn from an exponential distribution with scale
    ``len(arr) * LAMBDA``, rounded up and capped at ``len(arr) - 1``. The
    members of each subset are then drawn from ``arr`` without replacement.

    Args:
        arr: Source list the elements are drawn from.
        n: Number of sets to produce.
        sort: When true, the drawn sizes are sorted ascending first, so the
            sets come out smallest to largest.

    Returns:
        A list of ``n`` sets of elements of ``arr``, in the order of the
        (optionally sorted) size draws. An empty ``arr`` yields ``n`` empty sets.
    """
    ll = len(arr)
    if ll == 0:
        # Nothing to draw from; without this guard the size cap below would be
        # -1 and the draw would fail on a negative sample size.
        return [set() for _ in range(n)]
    bnd = RNG.exponential(ll * LAMBDA, n)
    if sort:
        bnd.sort()
    # Sample the elements of `arr` themselves rather than positions in
    # range(len(arr)); the two only coincide when arr == list(range(len(arr))).
    # Convert once up front so choice() does not re-convert the list per draw.
    pool = np.asarray(arr)
    return [
        set(RNG.choice(pool, size=min(math.ceil(b), ll - 1), replace=False).tolist())
        for b in bnd
    ]

def time_folds(sampler, sort: bool) -> np.ndarray:
    """Time ``foldl_intersection`` on ``N_SAMPLES`` freshly sampled inputs.

    All inputs are generated before any timing starts, so only the fold
    itself falls inside the measured region.

    Args:
        sampler: Callable invoked as ``sampler(arr, n, sort=sort)`` that
            returns the collection of sets to intersect.
        sort: Forwarded to ``sampler`` unchanged.

    Returns:
        Array of length ``N_SAMPLES`` holding the elapsed time of each fold
        in nanoseconds.
    """
    doc_ids = list(range(DOCUMENT_ID_SPACE_SZ))

    batches = []
    for _ in range(N_SAMPLES):
        batches.append(sampler(doc_ids, N_POSTINGS_LISTS, sort=sort))

    elapsed_ns = np.empty(N_SAMPLES)
    for slot, postings in enumerate(batches):
        started = time.perf_counter_ns()
        foldl_intersection(postings)
        finished = time.perf_counter_ns()
        elapsed_ns[slot] = finished - started
    return elapsed_ns

# One histogram panel per sampler, comparing fold timings with and without
# sorting the sampled set sizes.
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
for ax, (name, sampler) in zip(axes, [("chain", chain_sampler), ("unif", unif_sampler)]):
    unsorted_times = time_folds(sampler, sort=False)
    sorted_times = time_folds(sampler, sort=True)
    ratio = np.median(unsorted_times) / np.median(sorted_times)

    # Shared log-spaced bins so the two histograms are directly comparable.
    # The lower edge is clamped to 1 ns: a 0 ns reading (possible on a coarse
    # clock) would make log10(lo) equal -inf and yield unusable bin edges.
    lo = max(min(unsorted_times.min(), sorted_times.min()), 1.0)
    hi = max(unsorted_times.max(), sorted_times.max())
    bins = np.logspace(np.log10(lo), np.log10(hi), 60)

    ax.hist(unsorted_times, bins=bins, alpha=0.5, label='unsorted')
    ax.hist(sorted_times, bins=bins, alpha=0.5, label='sorted')
    ax.set_xscale('log')
    ax.set_xlabel('ns per foldl_intersection')
    # Value shown as "proj" in the title, next to the measured median ratio:
    # ln(N_POSTINGS_LISTS) for the chain sampler, N_POSTINGS_LISTS otherwise.
    ww = math.log(N_POSTINGS_LISTS) if name == "chain" else N_POSTINGS_LISTS
    ax.set_title(f"{name}_sampler (median ratio: {ratio:.2f}x, proj: {ww:.2f}x)")
    ax.legend()