Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 50 additions & 59 deletions src/ua_parser/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,27 +97,45 @@
return rules


def parse_item(item: str, all: list[str] | None) -> list[str]:
if item == '*':
assert all
return all
elif item.startswith('{'):
assert item.endswith('}')
return item[1:-1].split(',')
else:
return [item]

def rules_to_parsers(args: argparse.Namespace) -> "Iterator[tuple[str, str, int]]":
    """Expand the CLI ``selector`` arguments into unique parser configurations.

    Each selector is a ``parser:cache:size`` triple whose components may
    each be ``*`` (every known value), a ``{a,b,c}`` enumeration, or a
    single value (see ``parse_item``). Yields every distinct
    ``(parser, cache, size)`` triple at most once, in first-seen order.
    """
    # NOTE(review): `Iterator` is not imported at module level (CI flags
    # ruff F821 here, and the bare annotation would raise NameError when
    # the `def` is evaluated). The annotation is quoted so the module
    # imports cleanly; add `from collections.abc import Iterator` to the
    # file's imports to resolve the lint properly.
    seen: set[tuple[str, str, int]] = set()
    for selector in args.selector:
        p, c, s = selector.split(':')
        for triplet in (
            # a size of 0 means "uncached": normalize the cache name to
            # "none" so equivalent configurations deduplicate correctly
            (pp, 'none' if ss == 0 else cc, ss)
            for pp in parse_item(p, ['basic', 're2', 'regex', 'legacy'])
            # non-cacheable parsers only get the "none" pseudo-cache
            for cc in (parse_item(c, list(CACHES)) if CACHEABLE[pp] else ['none'])
            for ss in (map(int, parse_item(s, None)) if cc != 'none' else [0])
        ):
            if triplet not in seen:
                seen.add(triplet)
                yield triplet

def run_stdout(args: argparse.Namespace) -> None:
lines = list(args.file)
lines = list(map(sys.intern, args.file))
count = len(lines)
uniques = len(set(lines))
print(f"{args.file.name}: {count} lines, {uniques} unique ({uniques / count:.0%})")

rules = get_rules(args.bases, args.regexes)
parsers = list(rules_to_parsers(args))

# width of the parser label
w = math.ceil(
3
+ max(map(len, args.bases))
+ max(map(len, args.caches))
+ max(map(math.log10, args.cachesizes))
rules = get_rules([*{p for p, _, _ in parsers}], args.regexes)

w = max(
math.ceil(3 + len(p) + len(c) + (s and math.log10(s)))
for p, c, s in parsers
)
for p, c, n in (
(p, c, n)
for p in args.bases
for c in (args.caches if CACHEABLE[p] and args.cachesizes != [0] else ["none"])
for n in (args.cachesizes if c != "none" else [0])
):
for p, c, n in parsers:
name = "-".join(map(str, filter(None, (p, c != "none" and c, n))))
print(f"{name:{w}}", end=": ", flush=True)

Expand All @@ -131,24 +149,18 @@


def run_csv(args: argparse.Namespace) -> None:
lines = list(args.file)
lines = list(map(sys.intern, args.file))
LEN = len(lines) * 1000
rules = get_rules(args.bases, args.regexes)

parsers = [
(p, c, n)
for p in args.bases
for c in (args.caches if CACHEABLE[p] else ["none"])
for n in (args.cachesizes if c != "none" else [0])
]
parsers = list(rules_to_parsers(args))
if not parsers:
sys.exit("No parser selected")

rules = get_rules([*{p for p, _, _ in parsers}], args.regexes)
columns = {"size": ""}
columns.update(
(f"{p}-{c}", p if c == "none" else f"{p}-{c}")
for p in args.bases
for c in (args.caches if CACHEABLE[p] else ["none"])
for p, c, _ in parsers
)
w = csv.DictWriter(
sys.stdout,
Expand All @@ -171,11 +183,13 @@
# cache could be ignored as it should always be `"none"`
for parser, cache, _ in ps:
p = get_parser(parser, cache, 0, rules)
zeroes[f"{parser}-{cache}"] = run(p, lines) // LEN
zeroes[f"{parser}-{cache}"] = run(p, linges) // LEN

Check failure on line 186 in src/ua_parser/__main__.py

View workflow job for this annotation

GitHub Actions / checks

ruff (F821)

src/ua_parser/__main__.py:186:50: F821 Undefined name `linges`

# special cases for configurations where we can't have
# cachesize lines, write the template row out directly
if args.bases == ["legacy"] or args.caches == ["none"] or args.cachesizes == [0]:
if all(p == 'legacy' for p, _, _ in parsers)\
or all(c == 'none' for _, c, _ in parsers)\
or all(s == 0 for _, _, s in parsers):
zeroes["size"] = 0
w.writerow(zeroes)
return
Expand Down Expand Up @@ -288,7 +302,7 @@
self.count += 1
return r

lines = list(args.file)
lines = list(map(sys.intern, args.file))
total = len(lines)
uniques = len(set(lines))
print(total, "lines", uniques, "uniques")
Expand Down Expand Up @@ -343,7 +357,7 @@


def run_threaded(args: argparse.Namespace) -> None:
lines = list(args.file)
lines = list(map(sys.intern, args.file))
basic = BasicResolver(load_builtins())
resolvers: List[Tuple[str, Resolver]] = [
("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))),
Expand Down Expand Up @@ -457,37 +471,14 @@
with a first cell of value 0.""",
)
bench.add_argument(
"--bases",
nargs="+",
choices=["basic", "re2", "regex", "legacy"],
default=["basic", "re2", "regex", "legacy"],
help="""Base resolvers to benchmark. `basic` is a linear search
through the regexes file, `re2` is a prefiltered regex set
implemented in C++, `regex` is a prefiltered regex set implemented
in Rust, `legacy` is the legacy API (essentially a basic resolver
with a clearing cache of fixed 200 entries, but less layered so
usually slightly faster than an equivalent basic-based resolver).""",
)
bench.add_argument(
"--caches",
nargs="+",
choices=list(CACHES),
default=list(CACHES),
help="""Cache implementations to test. `clearing` completely
clears the cache when full, `lru` uses a least-recently-eviction
policy. `lru` is not thread-safe, so `lru-threadsafe` adds a mutex
and measures *uncontended* locking overhead.""",
)
bench.add_argument(
"--cachesizes",
nargs="+",
type=int,
default=[10, 20, 50, 100, 200, 500, 1000, 2000, 5000],
help="""Caches are a classic way to trade memory for performances.
Different base resolvers and traffic patterns have different
benefits from caches, this option allows testing the benefits of
various cache sizes (and thus amounts of memory used) on the cache
strategies. """,
"selector",
nargs="*",
default=["*:*:{10,20,50,100,200,500,1000,2000,5000}"],
help=f"""A generative selector expression, composed of 3 parts: 1.
the parser (base), 2. the cache implementation ({', '.join(CACHES)})
and 3. the cache size. For parser and cache `*` is an alias for stands
in for "every value", a bracketed expression for an enumeration, and
the selector can be repeated to explicitly list each configuration """
)

hitrates = sub.add_parser(
Expand Down
Loading