From fa2e468d9912c4a82225bbdeda4756d62e30d58e Mon Sep 17 00:00:00 2001 From: masklinn Date: Sun, 22 Feb 2026 10:16:42 +0100 Subject: [PATCH 1/2] Intern UAs on perf script intake The perf scripts do multiple passes over the input[^1] so they need the entire input in memory. However they don't need to hold every line in memory individually: UA logs tend to be pretty redundant (80 to 95% depending on the site), and the strings are the vast majority of the payload (as they average 100~150 bytes each). We can memoize the inputs in order to dedup them, and while *usually* that's not a good idea since the content has to live for the entire program lifetime we can abuse `sys.intern` for it (reduces the amount of change necessary, python GCs `sys.intern` anyway). This reduces memory consumption of the UAs list by an order of magnitude or so[^2] which is *very* significant for large logs, although note that the belady simulator is heinously costly: running hitrates on the 174M UAs "sample 2" dataset, memory use falls by 10~15GB when the belady sim completes, might be a good idea to try and find out which of its collections is the source of the problem and see if it can be improved upon. 
This also makes hitrates significantly faster on sample 2, likely as a combination of two factors (though that has not been confirmed in any way so YMMV): - lower memory / cache thrashing from having to trawl less memory - much more efficient dict hit (a pointer comparison is sufficient to validate a key after hashcode check), especially combined with sample 2 having significantly higher hit rates than sample 1 (dailymotion) as a cache hit is a dict hit first (though there's costs associated with metadata maintenance afterwards) [^1]: technically they could do just one by interleaving all the parser configurations but currently that's not the case, I also worry that this would affect CPU-level prediction although I guess since this is Python it's not that much of a worry, and UA parsing is a pretty unpredictable workload... [^2]: UA strings are 100~150 bytes on average, dedup'ing them means storing an 8 byte pointer, plus the average of the UA length over its dupes, which averages out to a handful of bytes --- src/ua_parser/__main__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ua_parser/__main__.py b/src/ua_parser/__main__.py index 047efaa..6062f7e 100644 --- a/src/ua_parser/__main__.py +++ b/src/ua_parser/__main__.py @@ -98,7 +98,7 @@ def get_rules(parsers: List[str], regexes: Optional[io.IOBase]) -> Matchers: def run_stdout(args: argparse.Namespace) -> None: - lines = list(args.file) + lines = list(map(sys.intern, args.file)) count = len(lines) uniques = len(set(lines)) print(f"{args.file.name}: {count} lines, {uniques} unique ({uniques / count:.0%})") @@ -131,7 +131,7 @@ def run_stdout(args: argparse.Namespace) -> None: def run_csv(args: argparse.Namespace) -> None: - lines = list(args.file) + lines = list(map(sys.intern, args.file)) LEN = len(lines) * 1000 rules = get_rules(args.bases, args.regexes) @@ -288,7 +288,7 @@ def __call__(self, ua: str, domains: Domain, /) -> PartialResult: self.count += 1 return r - lines = 
list(args.file) + lines = list(map(sys.intern, args.file)) total = len(lines) uniques = len(set(lines)) print(total, "lines", uniques, "uniques") @@ -343,7 +343,7 @@ def worker( def run_threaded(args: argparse.Namespace) -> None: - lines = list(args.file) + lines = list(map(sys.intern, args.file)) basic = BasicResolver(load_builtins()) resolvers: List[Tuple[str, Resolver]] = [ ("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))), From e9146e9625b0dab9cc4c174eebde1225ea99c1a5 Mon Sep 17 00:00:00 2001 From: masklinn Date: Sun, 22 Feb 2026 21:47:33 +0100 Subject: [PATCH 2/2] Switch bench to a more flexible selectors system In checking things with sample 2, I've realised the current configuration system is too inflexible to allow easily running bench just once when some of the configurations are just not acceptable or sensible. A more flexible selector system rather than 3 separate options being combined as a cartesian product allows cleaner parser configurations, which makes running things easier. 
--- src/ua_parser/__main__.py | 101 +++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 55 deletions(-) diff --git a/src/ua_parser/__main__.py b/src/ua_parser/__main__.py index 6062f7e..1231058 100644 --- a/src/ua_parser/__main__.py +++ b/src/ua_parser/__main__.py @@ -97,27 +97,45 @@ def get_rules(parsers: List[str], regexes: Optional[io.IOBase]) -> Matchers: return rules +def parse_item(item: str, all: list[str] | None) -> list[str]: + if item == '*': + assert all + return all + elif item.startswith('{'): + assert item.endswith('}') + return item[1:-1].split(',') + else: + return [item] + +def rules_to_parsers(args: argparse.Namespace) -> Iterator[tuple[str, str, int]]: + seen = set() + for selector in args.selector: + p, c, s = selector.split(':') + for triplet in ( + (pp, 'none' if ss == 0 else cc, ss) + for pp in parse_item(p, ['basic', 're2', 'regex', 'legacy']) + for cc in (parse_item(c, list(CACHES)) if CACHEABLE[pp] else ['none']) + for ss in (map(int, parse_item(s, None)) if cc != 'none' else [0]) + ): + if triplet not in seen: + seen.add(triplet) + yield triplet + def run_stdout(args: argparse.Namespace) -> None: lines = list(map(sys.intern, args.file)) count = len(lines) uniques = len(set(lines)) print(f"{args.file.name}: {count} lines, {uniques} unique ({uniques / count:.0%})") - rules = get_rules(args.bases, args.regexes) + parsers = list(rules_to_parsers(args)) - # width of the parser label - w = math.ceil( - 3 - + max(map(len, args.bases)) - + max(map(len, args.caches)) - + max(map(math.log10, args.cachesizes)) + rules = get_rules([*{p for p, _, _ in parsers}], args.regexes) + + w = max( + math.ceil(3 + len(p) + len(c) + (s and math.log10(s))) + for p, c, s in parsers ) - for p, c, n in ( - (p, c, n) - for p in args.bases - for c in (args.caches if CACHEABLE[p] and args.cachesizes != [0] else ["none"]) - for n in (args.cachesizes if c != "none" else [0]) - ): + for p, c, n in parsers: name = "-".join(map(str, filter(None, (p, c 
!= "none" and c, n)))) print(f"{name:{w}}", end=": ", flush=True) @@ -133,22 +151,16 @@ def run_stdout(args: argparse.Namespace) -> None: def run_csv(args: argparse.Namespace) -> None: lines = list(map(sys.intern, args.file)) LEN = len(lines) * 1000 - rules = get_rules(args.bases, args.regexes) - parsers = [ - (p, c, n) - for p in args.bases - for c in (args.caches if CACHEABLE[p] else ["none"]) - for n in (args.cachesizes if c != "none" else [0]) - ] + parsers = list(rules_to_parsers(args)) if not parsers: sys.exit("No parser selected") + rules = get_rules([*{p for p, _, _ in parsers}], args.regexes) columns = {"size": ""} columns.update( (f"{p}-{c}", p if c == "none" else f"{p}-{c}") - for p in args.bases - for c in (args.caches if CACHEABLE[p] else ["none"]) + for p, c, _ in parsers ) w = csv.DictWriter( sys.stdout, @@ -171,11 +183,13 @@ def run_csv(args: argparse.Namespace) -> None: # cache could be ignored as it should always be `"none"` for parser, cache, _ in ps: p = get_parser(parser, cache, 0, rules) - zeroes[f"{parser}-{cache}"] = run(p, lines) // LEN + zeroes[f"{parser}-{cache}"] = run(p, lines) // LEN # special cases for configurations where we can't have # cachesize lines, write the template row out directly - if args.bases == ["legacy"] or args.caches == ["none"] or args.cachesizes == [0]: + if all(p == 'legacy' for p, _, _ in parsers)\ + or all(c == 'none' for _, c, _ in parsers)\ + or all(s == 0 for _, _, s in parsers): zeroes["size"] = 0 w.writerow(zeroes) return @@ -457,37 +471,14 @@ def __call__( with a first cell of value 0.""", ) bench.add_argument( - "--bases", - nargs="+", - choices=["basic", "re2", "regex", "legacy"], - default=["basic", "re2", "regex", "legacy"], - help="""Base resolvers to benchmark. 
`basic` is a linear search - through the regexes file, `re2` is a prefiltered regex set - implemented in C++, `regex` is a prefiltered regex set implemented - in Rust, `legacy` is the legacy API (essentially a basic resolver - with a clearing cache of fixed 200 entries, but less layered so - usually slightly faster than an equivalent basic-based resolver).""", -) -bench.add_argument( - "--caches", - nargs="+", - choices=list(CACHES), - default=list(CACHES), - help="""Cache implementations to test. `clearing` completely - clears the cache when full, `lru` uses a least-recently-eviction - policy. `lru` is not thread-safe, so `lru-threadsafe` adds a mutex - and measures *uncontended* locking overhead.""", -) -bench.add_argument( - "--cachesizes", - nargs="+", - type=int, - default=[10, 20, 50, 100, 200, 500, 1000, 2000, 5000], - help="""Caches are a classic way to trade memory for performances. - Different base resolvers and traffic patterns have different - benefits from caches, this option allows testing the benefits of - various cache sizes (and thus amounts of memory used) on the cache - strategies. """, + "selector", + nargs="*", + default=["*:*:{10,20,50,100,200,500,1000,2000,5000}"], + help=f"""A generative selector expression, composed of 3 parts: 1. +the parser (base), 2. the cache implementation ({', '.join(CACHES)}) +and 3. the cache size. For parser and cache `*` stands +in for "every value", a bracketed expression for an enumeration, and +the selector can be repeated to explicitly list each configuration """ ) hitrates = sub.add_parser(