diff --git a/Lib/difflib.py b/Lib/difflib.py index 7c7e233b013a76..eb385d86ad76a4 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -19,6 +19,11 @@ Class SequenceMatcher: A flexible class for comparing pairs of sequences of any type. +Class GestaltSequenceMatcher: + Class for comparing pairs of sequences that uses SuffixAutomaton. + It does not have autojunk option and always calculates exact result. + Additionally, it has balancing "knob" to improve quality of diffs. + Class Differ: For producing human-readable deltas from sequences of lines of text. @@ -28,99 +33,93 @@ __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', - 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match'] + 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match', + 'GestaltSequenceMatcher'] from _colorize import can_colorize, get_theme from heapq import nlargest as _nlargest -from collections import namedtuple as _namedtuple +from collections import Counter as _Counter, namedtuple as _namedtuple from types import GenericAlias +from sys import maxsize as _MAXSIZE -Match = _namedtuple('Match', 'a b size') - -def _calculate_ratio(matches, length): - if length: - return 2.0 * matches / length - return 1.0 - -class SequenceMatcher: - - """ - SequenceMatcher is a flexible class for comparing pairs of sequences of - any type, so long as the sequence elements are hashable. The basic - algorithm predates, and is a little fancier than, an algorithm - published in the late 1980's by Ratcliff and Obershelp under the - hyperbolic name "gestalt pattern matching". The basic idea is to find - the longest contiguous matching subsequence that contains no "junk" - elements (R-O doesn't address junk). The same idea is then applied - recursively to the pieces of the sequences to the left and to the right - of the matching subsequence. This does not yield minimal edit - sequences, but does tend to yield matches that "look right" to people. - SequenceMatcher tries to compute a "human-friendly diff" between two - sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the - longest *contiguous* & junk-free matching subsequence. That's what - catches peoples' eyes. The Windows(tm) windiff has another interesting - notion, pairing up elements that appear uniquely in each sequence. - That, and the method here, appear to yield more intuitive difference - reports than does diff. This method appears to be the least vulnerable - to syncing up on blocks of "junk lines", though (like blank lines in - ordinary text files, or maybe "
<P>
" lines in HTML files). That may be - because this is the only method of the 3 that has a *concept* of - "junk" . +######################################################################## +### Utilities +######################################################################## - Example, comparing two strings, and considering blanks to be "junk": - >>> s = SequenceMatcher(lambda x: x == " ", - ... "private Thread currentThread;", - ... "private volatile Thread currentThread;") - >>> +def _adjust_indices(size, start, stop): + if start < 0: + raise ValueError('Starting index can not be negative') + if stop is None or stop > size: + stop = size + return start, stop - .ratio() returns a float in [0, 1], measuring the "similarity" of the - sequences. As a rule of thumb, a .ratio() value over 0.6 means the - sequences are close matches: - >>> print(round(s.ratio(), 2)) - 0.87 - >>> +def _collapse_adjacent_blocks(blocks): + """Collapses adjacent blocks + """ + i1 = j1 = k1 = 0 + for i2, j2, k2 in blocks: + # Is this block adjacent to i1, j1, k1? + if i1 + k1 == i2 and j1 + k1 == j2: + # Yes, so collapse them -- this just increases the length of + # the first block by the length of the second, and the first + # block so lengthened remains the block to compare against. + k1 += k2 + else: + # Not adjacent. Remember the first block (k1==0 means it's + # the dummy we started with), and make the second block the + # new block to compare against. + if k1: + yield (i1, j1, k1) + i1, j1, k1 = i2, j2, k2 + if k1: + yield (i1, j1, k1) - If you're only interested in where the sequences match, - .get_matching_blocks() is handy: - >>> for block in s.get_matching_blocks(): - ... print("a[%d] and b[%d] match for %d elements" % block) - a[0] and b[0] match for 8 elements - a[8] and b[17] match for 21 elements - a[29] and b[38] match for 0 elements +def _expand_block_to_junk(junk, block, a, b, alo, ahi, blo, bhi, *, inverse=False): + """ + Expands block for consecutive matches at both sides if: + a) characters match + b) matching characters are in junk + If inverse == True, (b) condition is inverted to: "are not in junk" + """ + i, j, k = block + while i > alo and j > blo: + el2 = b[j - 1] + ok = el2 not in junk if inverse else el2 in junk + if not ok or a[i - 1] != el2: + break + i -= 1 + j -= 1 + k += 1 + while i + k < ahi and j + k < bhi: + el2 = b[j + k] + ok = el2 not in junk if inverse else el2 in junk + if not ok or a[i + k] != el2: + break + k += 1 + return (i, j, k) - Note that the last tuple returned by .get_matching_blocks() is always a - dummy, (len(a), len(b), 0), and this is the only case in which the last - tuple element (number of elements matched) is 0. - If you want to know how to change the first sequence into the second, - use .get_opcodes(): +######################################################################## +### SequenceMatcherBase +######################################################################## - >>> for opcode in s.get_opcodes(): - ... print("%6s a[%d:%d] b[%d:%d]" % opcode) - equal a[0:8] b[0:8] - insert a[8:8] b[8:17] - equal a[8:29] b[17:38] - See the Differ class for a fancy human-friendly file differencer, which - uses SequenceMatcher both to compare sequences of lines, and to compare - sequences of characters within similar (near-matching) lines. +Match = _namedtuple('Match', 'a b size') - See also function get_close_matches() in this module, which shows how - simple code building on SequenceMatcher can be used to do useful work. 
- Timing: Basic R-O is cubic time worst case and quadratic time expected - case. SequenceMatcher is quadratic time for the worst case and has - expected-case behavior dependent in a complicated way on how many - elements the sequences have in common; best case time is linear. - """ +def _calculate_ratio(matches, length): + if length: + return 2.0 * matches / length + return 1.0 - def __init__(self, isjunk=None, a='', b='', autojunk=True): - """Construct a SequenceMatcher. +class SequenceMatcherBase: + def __init__(self, isjunk=None, a='', b=''): + """ Optional arg isjunk is None (the default), or a one-argument function that takes a sequence element and returns true iff the element is junk. None is equivalent to passing "lambda x: 0", i.e. @@ -137,60 +136,35 @@ def __init__(self, isjunk=None, a='', b='', autojunk=True): default, an empty string. The elements of b must be hashable. See also .set_seqs() and .set_seq2(). - Optional arg autojunk should be set to False to disable the - "automatic junk heuristic" that treats popular elements as junk - (see module documentation for more information). + Members: + a : Sequence + first sequence + b : Sequence + second sequence; differences are computed as "what do + we need to do to 'a' to change it into 'b'?" + isjunk : Callable | None + a user-supplied function taking a sequence element and + returning true iff the element is "junk" + "junk" elements are unmatchable elements + matching_blocks : list + a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; + ascending & non-overlapping in i and in j; terminated by + a dummy (len(a), len(b), 0) sentinel + opcodes : list + a list of (tag, i1, i2, j1, j2) tuples, where tag is + one of + 'replace' a[i1:i2] should be replaced by b[j1:j2] + 'delete' a[i1:i2] should be deleted + 'insert' b[j1:j2] should be inserted + 'equal' a[i1:i2] == b[j1:j2] """ - - # Members: - # a - # first sequence - # b - # second sequence; differences are computed as "what do - # we need to do to 'a' to change it into 'b'?" - # b2j - # for x in b, b2j[x] is a list of the indices (into b) - # at which x appears; junk and popular elements do not appear - # fullbcount - # for x in b, fullbcount[x] == the number of times x - # appears in b; only materialized if really needed (used - # only for computing quick_ratio()) - # matching_blocks - # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; - # ascending & non-overlapping in i and in j; terminated by - # a dummy (len(a), len(b), 0) sentinel - # opcodes - # a list of (tag, i1, i2, j1, j2) tuples, where tag is - # one of - # 'replace' a[i1:i2] should be replaced by b[j1:j2] - # 'delete' a[i1:i2] should be deleted - # 'insert' b[j1:j2] should be inserted - # 'equal' a[i1:i2] == b[j1:j2] - # isjunk - # a user-supplied function taking a sequence element and - # returning true iff the element is "junk" -- this has - # subtle but helpful effects on the algorithm, which I'll - # get around to writing up someday <0.9 wink>. - # DON'T USE! Only __chain_b uses this. Use "in self.bjunk". - # bjunk - # the items in b for which isjunk is True. - # bpopular - # nonjunk items in b treated as junk by the heuristic (if used). - self.isjunk = isjunk - self.a = self.b = None - self.autojunk = autojunk + self.a = None + self.b = None self.set_seqs(a, b) def set_seqs(self, a, b): - """Set the two sequences to be compared. 
- - >>> s = SequenceMatcher() - >>> s.set_seqs("abcd", "bcde") - >>> s.ratio() - 0.75 - """ - + """Set the two sequences to be compared.""" self.set_seq1(a) self.set_seq2(b) @@ -246,178 +220,28 @@ def set_seq2(self, b): self.b = b self.matching_blocks = self.opcodes = None self.fullbcount = None - self.__chain_b() - - # For each element x in b, set b2j[x] to a list of the indices in - # b where x appears; the indices are in increasing order; note that - # the number of times x appears in b is len(b2j[x]) ... - # when self.isjunk is defined, junk elements don't show up in this - # map at all, which stops the central find_longest_match method - # from starting any matching block at a junk element ... - # b2j also does not contain entries for "popular" elements, meaning - # elements that account for more than 1 + 1% of the total elements, and - # when the sequence is reasonably large (>= 200 elements); this can - # be viewed as an adaptive notion of semi-junk, and yields an enormous - # speedup when, e.g., comparing program files with hundreds of - # instances of "return NULL;" ... - # note that this is only called when b changes; so for cross-product - # kinds of matches, it's best to call set_seq2 once, then set_seq1 - # repeatedly - - def __chain_b(self): - # Because isjunk is a user-defined (not C) function, and we test - # for junk a LOT, it's important to minimize the number of calls. - # Before the tricks described here, __chain_b was by far the most - # time-consuming routine in the whole module! If anyone sees - # Jim Roskind, thank him again for profile.py -- I never would - # have guessed that. - # The first trick is to build b2j ignoring the possibility - # of junk. I.e., we don't call isjunk at all yet. Throwing - # out the junk later is much cheaper than building b2j "right" - # from the start. - b = self.b - self.b2j = b2j = {} - - for i, elt in enumerate(b): - indices = b2j.setdefault(elt, []) - indices.append(i) - - # Purge junk elements - self.bjunk = junk = set() - isjunk = self.isjunk - if isjunk: - for elt in b2j.keys(): - if isjunk(elt): - junk.add(elt) - for elt in junk: # separate loop avoids separate list of keys - del b2j[elt] - - # Purge popular elements that are not junk - self.bpopular = popular = set() - n = len(b) - if self.autojunk and n >= 200: - ntest = n // 100 + 1 - for elt, idxs in b2j.items(): - if len(idxs) > ntest: - popular.add(elt) - for elt in popular: # ditto; as fast for 1% deletion - del b2j[elt] - - def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): - """Find longest matching block in a[alo:ahi] and b[blo:bhi]. - - By default it will find the longest match in the entirety of a and b. - - If isjunk is not defined: - - Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where - alo <= i <= i+k <= ahi - blo <= j <= j+k <= bhi - and for all (i',j',k') meeting those conditions, - k >= k' - i <= i' - and if i == i', j <= j' - - In other words, of all maximal matching blocks, return one that - starts earliest in a, and of all those maximal matching blocks that - start earliest in a, return the one that starts earliest in b. - - >>> s = SequenceMatcher(None, " abcd", "abcd abcd") - >>> s.find_longest_match(0, 5, 0, 9) - Match(a=0, b=4, size=5) + self._prepare_seq2() - If isjunk is defined, first the longest matching block is - determined as above, but with the additional restriction that no - junk element appears in the block. Then that block is extended as - far as possible by matching (only) junk elements on both sides. 
So - the resulting block never matches on junk except as identical junk - happens to be adjacent to an "interesting" match. - - Here's the same example as before, but considering blanks to be - junk. That prevents " abcd" from matching the " abcd" at the tail - end of the second sequence directly. Instead only the "abcd" can - match, and matches the leftmost "abcd" in the second sequence: - - >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd") - >>> s.find_longest_match(0, 5, 0, 9) - Match(a=1, b=0, size=4) - - If no blocks match, return (alo, blo, 0). - - >>> s = SequenceMatcher(None, "ab", "c") - >>> s.find_longest_match(0, 2, 0, 1) - Match(a=0, b=0, size=0) + def _prepare_seq2(self): + """Preparation function that is called at the end of `set_seq2`. + It is usually used to: + a) Process junk + b) Pre-compile elligible parts of algorithm """ + pass - # CAUTION: stripping common prefix or suffix would be incorrect. - # E.g., - # ab - # acab - # Longest matching block is "ab", but if common prefix is - # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so - # strip, so ends up claiming that ab is changed to acab by - # inserting "ca" in the middle. That's minimal but unintuitive: - # "it's obvious" that someone inserted "ac" at the front. - # Windiff ends up at the same place as diff, but by pairing up - # the unique 'b's and then matching the first two 'a's. + # Abstract Methods ---------------- + # --------------------------------- - a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ - if ahi is None: - ahi = len(a) - if bhi is None: - bhi = len(b) - besti, bestj, bestsize = alo, blo, 0 - # find longest junk-free match - # during an iteration of the loop, j2len[j] = length of longest - # junk-free match ending with a[i-1] and b[j] - j2len = {} - nothing = [] - for i in range(alo, ahi): - # look at all instances of a[i] in b; note that because - # b2j has no junk keys, the loop is skipped if a[i] is junk - j2lenget = j2len.get - newj2len = {} - for j in b2j.get(a[i], nothing): - # a[i] matches b[j] - if j < blo: - continue - if j >= bhi: - break - k = newj2len[j] = j2lenget(j-1, 0) + 1 - if k > bestsize: - besti, bestj, bestsize = i-k+1, j-k+1, k - j2len = newj2len + def _get_matching_blocks(self): + """Return list of triples describing matching subsequences. + Implement this to return list[tuple[int, int, int]] and + let `get_matching_blocks` take care of maintenance + """ + raise NotImplementedError - # Extend the best by non-junk elements on each end. In particular, - # "popular" non-junk elements aren't in b2j, which greatly speeds - # the inner loop above, but also means "the best" match so far - # doesn't contain any junk *or* popular non-junk elements. - while besti > alo and bestj > blo and \ - not isbjunk(b[bestj-1]) and \ - a[besti-1] == b[bestj-1]: - besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 - while besti+bestsize < ahi and bestj+bestsize < bhi and \ - not isbjunk(b[bestj+bestsize]) and \ - a[besti+bestsize] == b[bestj+bestsize]: - bestsize += 1 - - # Now that we have a wholly interesting match (albeit possibly - # empty!), we may as well suck up the matching junk on each - # side of it too. Can't think of a good reason not to, and it - # saves post-processing the (possibly considerable) expense of - # figuring out what to do with it. In the case of an empty - # interesting match, this is clearly the right thing to do, - # because no other kind of match is possible in the regions. 
- while besti > alo and bestj > blo and \ - isbjunk(b[bestj-1]) and \ - a[besti-1] == b[bestj-1]: - besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 - while besti+bestsize < ahi and bestj+bestsize < bhi and \ - isbjunk(b[bestj+bestsize]) and \ - a[besti+bestsize] == b[bestj+bestsize]: - bestsize = bestsize + 1 - - return Match(besti, bestj, bestsize) + # Implemented Methods ------------- + # --------------------------------- def get_matching_blocks(self): """Return list of triples describing matching subsequences. @@ -431,64 +255,23 @@ def get_matching_blocks(self): blocks. The last triple is a dummy, (len(a), len(b), 0), and is the only - triple with n==0. + triple with n==0. - >>> s = SequenceMatcher(None, "abxcd", "abcd") - >>> list(s.get_matching_blocks()) - [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] + When `_get_matching_blocks` is implemented, this method takes care of: + 1. Appending last dummy tripple + 2. Collapsing adjacent blocks + 3. Caching """ - - if self.matching_blocks is not None: - return self.matching_blocks - la, lb = len(self.a), len(self.b) - - # This is most naturally expressed as a recursive algorithm, but - # at least one user bumped into extreme use cases that exceeded - # the recursion limit on their box. So, now we maintain a list - # ('queue`) of blocks we still need to look at, and append partial - # results to `matching_blocks` in a loop; the matches are sorted - # at the end. - queue = [(0, la, 0, lb)] - matching_blocks = [] - while queue: - alo, ahi, blo, bhi = queue.pop() - i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) - # a[alo:i] vs b[blo:j] unknown - # a[i:i+k] same as b[j:j+k] - # a[i+k:ahi] vs b[j+k:bhi] unknown - if k: # if k is 0, there was no matching block - matching_blocks.append(x) - if alo < i and blo < j: - queue.append((alo, i, blo, j)) - if i+k < ahi and j+k < bhi: - queue.append((i+k, ahi, j+k, bhi)) - matching_blocks.sort() - - # It's possible that we have adjacent equal blocks in the - # matching_blocks list now. Starting with 2.5, this code was added - # to collapse them. - i1 = j1 = k1 = 0 - non_adjacent = [] - for i2, j2, k2 in matching_blocks: - # Is this block adjacent to i1, j1, k1? - if i1 + k1 == i2 and j1 + k1 == j2: - # Yes, so collapse them -- this just increases the length of - # the first block by the length of the second, and the first - # block so lengthened remains the block to compare against. - k1 += k2 - else: - # Not adjacent. Remember the first block (k1==0 means it's - # the dummy we started with), and make the second block the - # new block to compare against. - if k1: - non_adjacent.append((i1, j1, k1)) - i1, j1, k1 = i2, j2, k2 - if k1: - non_adjacent.append((i1, j1, k1)) - - non_adjacent.append( (la, lb, 0) ) - self.matching_blocks = list(map(Match._make, non_adjacent)) - return self.matching_blocks + blocks = self.matching_blocks + if blocks is None: + blocks = self._get_matching_blocks() + blocks = _collapse_adjacent_blocks(blocks) + blocks = list(map(Match._make, blocks)) + # Append dummy at the end + blocks.append(Match(len(self.a), len(self.b), 0)) + # Cache + self.matching_blocks = blocks + return blocks def get_opcodes(self): """Return list of 5-tuples describing how to turn a into b. @@ -598,73 +381,417 @@ def get_grouped_opcodes(self, n=3): def ratio(self): """Return a measure of the sequences' similarity (float in [0,1]). - Where T is the total number of elements in both sequences, and - M is the number of matches, this is 2.0*M / T. 
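
For example, with a = "abcd" and b = "bcde" (the doctest below), the only matching block is "bcd", so M = 3, T = 4 + 4 = 8, and ratio = 2.0 * 3 / 8 = 0.75.
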
- Note that this is 1 if the sequences are identical, and 0 if - they have nothing in common. + Where T is the total number of elements in both sequences, and + M is the number of matches, this is 2.0*M / T. + Note that this is 1 if the sequences are identical, and 0 if + they have nothing in common. + + .ratio() is expensive to compute if you haven't already computed + .get_matching_blocks() or .get_opcodes(), in which case you may + want to try .quick_ratio() or .real_quick_ratio() first to get an + upper bound. + + >>> s = SequenceMatcher(None, "abcd", "bcde") + >>> s.ratio() + 0.75 + >>> s.quick_ratio() + 0.75 + >>> s.real_quick_ratio() + 1.0 + """ + + matches = sum(triple[-1] for triple in self.get_matching_blocks()) + return _calculate_ratio(matches, len(self.a) + len(self.b)) + + def quick_ratio(self): + """Return an upper bound on ratio() relatively quickly. + + This isn't defined beyond that it is an upper bound on .ratio(), and + is faster to compute. + """ + + # viewing a and b as multisets, set matches to the cardinality + # of their intersection; this counts the number of matches + # without regard to order, so is clearly an upper bound + if self.fullbcount is None: + self.fullbcount = fullbcount = {} + for elt in self.b: + fullbcount[elt] = fullbcount.get(elt, 0) + 1 + fullbcount = self.fullbcount + # avail[x] is the number of times x appears in 'b' less the + # number of times we've seen it in 'a' so far ... kinda + avail = {} + matches = 0 + for elt in self.a: + if elt in avail: + numb = avail[elt] + else: + numb = fullbcount.get(elt, 0) + avail[elt] = numb - 1 + if numb > 0: + matches += 1 + return _calculate_ratio(matches, len(self.a) + len(self.b)) + + def real_quick_ratio(self): + """Return an upper bound on ratio() very quickly. + + This isn't defined beyond that it is an upper bound on .ratio(), and + is faster to compute than either .ratio() or .quick_ratio(). + """ + + la, lb = len(self.a), len(self.b) + # can't have more matches than the number of elements in the + # shorter sequence + return _calculate_ratio(min(la, lb), la + lb) + + def ratio_if_above(self, cutoff, equal_ok=False): + """Returns ratio if it is higher than cutoff. + Otherwise, returns None. + Note, this is the main ratio function that is used by applications + in this module. + """ + rqr = self.real_quick_ratio() + if equal_ok: + if rqr >= cutoff and self.quick_ratio() >= cutoff: + ratio = self.ratio() + if ratio >= cutoff: + return ratio + else: + if rqr > cutoff and self.quick_ratio() > cutoff: + ratio = self.ratio() + if ratio > cutoff: + return ratio + + __class_getitem__ = classmethod(GenericAlias) + + +######################################################################## +### SequenceMatcher +######################################################################## + + +class SequenceMatcher(SequenceMatcherBase): + + """ + SequenceMatcher is a flexible class for comparing pairs of sequences of + any type, so long as the sequence elements are hashable. The basic + algorithm predates, and is a little fancier than, an algorithm + published in the late 1980's by Ratcliff and Obershelp under the + hyperbolic name "gestalt pattern matching". The basic idea is to find + the longest contiguous matching subsequence that contains no "junk" + elements (R-O doesn't address junk). The same idea is then applied + recursively to the pieces of the sequences to the left and to the right + of the matching subsequence. 
This does not yield minimal edit + sequences, but does tend to yield matches that "look right" to people. + + SequenceMatcher tries to compute a "human-friendly diff" between two + sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the + longest *contiguous* & junk-free matching subsequence. That's what + catches peoples' eyes. The Windows(tm) windiff has another interesting + notion, pairing up elements that appear uniquely in each sequence. + That, and the method here, appear to yield more intuitive difference + reports than does diff. This method appears to be the least vulnerable + to syncing up on blocks of "junk lines", though (like blank lines in + ordinary text files, or maybe "
<P>
" lines in HTML files). That may be + because this is the only method of the 3 that has a *concept* of + "junk" . + + Example, comparing two strings, and considering blanks to be "junk": + + >>> s = SequenceMatcher(lambda x: x == " ", + ... "private Thread currentThread;", + ... "private volatile Thread currentThread;") + >>> + + .ratio() returns a float in [0, 1], measuring the "similarity" of the + sequences. As a rule of thumb, a .ratio() value over 0.6 means the + sequences are close matches: + + >>> print(round(s.ratio(), 2)) + 0.87 + >>> + + If you're only interested in where the sequences match, + .get_matching_blocks() is handy: + + >>> for block in s.get_matching_blocks(): + ... print("a[%d] and b[%d] match for %d elements" % block) + a[0] and b[0] match for 8 elements + a[8] and b[17] match for 21 elements + a[29] and b[38] match for 0 elements + + Note that the last tuple returned by .get_matching_blocks() is always a + dummy, (len(a), len(b), 0), and this is the only case in which the last + tuple element (number of elements matched) is 0. + + If you want to know how to change the first sequence into the second, + use .get_opcodes(): + + >>> for opcode in s.get_opcodes(): + ... print("%6s a[%d:%d] b[%d:%d]" % opcode) + equal a[0:8] b[0:8] + insert a[8:8] b[8:17] + equal a[8:29] b[17:38] + + See the Differ class for a fancy human-friendly file differencer, which + uses SequenceMatcher both to compare sequences of lines, and to compare + sequences of characters within similar (near-matching) lines. + + See also function get_close_matches() in this module, which shows how + simple code building on SequenceMatcher can be used to do useful work. + + Timing: Basic R-O is cubic time worst case and quadratic time expected + case. SequenceMatcher is quadratic time for the worst case and has + expected-case behavior dependent in a complicated way on how many + elements the sequences have in common; best case time is linear. + """ + + def __init__(self, isjunk=None, a='', b='', autojunk=True): + """Construct a SequenceMatcher. + + Optional arg autojunk should be set to False to disable the + "automatic junk heuristic" that treats popular elements as junk + (see module documentation for more information). + """ + + # Members specific to Sequence Matcher: + # b2j + # for x in b, b2j[x] is a list of the indices (into b) + # at which x appears; junk and popular elements do not appear + # fullbcount + # for x in b, fullbcount[x] == the number of times x + # appears in b; only materialized if really needed (used + # only for computing quick_ratio()) + # isjunk + # a user-supplied function taking a sequence element and + # returning true iff the element is "junk" -- this has + # subtle but helpful effects on the algorithm, which I'll + # get around to writing up someday <0.9 wink>. + # DON'T USE! Only __chain_b uses this. Use "in self.bjunk". + # bjunk + # the items in b for which isjunk is True. + # bpopular + # nonjunk items in b treated as junk by the heuristic (if used). + self.autojunk = autojunk + super().__init__(isjunk, a, b) + + # For each element x in b, set b2j[x] to a list of the indices in + # b where x appears; the indices are in increasing order; note that + # the number of times x appears in b is len(b2j[x]) ... + # when self.isjunk is defined, junk elements don't show up in this + # map at all, which stops the central find_longest_match method + # from starting any matching block at a junk element ... 
+ # b2j also does not contain entries for "popular" elements, meaning + # elements that account for more than 1 + 1% of the total elements, and + # when the sequence is reasonably large (>= 200 elements); this can + # be viewed as an adaptive notion of semi-junk, and yields an enormous + # speedup when, e.g., comparing program files with hundreds of + # instances of "return NULL;" ... + # note that this is only called when b changes; so for cross-product + # kinds of matches, it's best to call set_seq2 once, then set_seq1 + # repeatedly + + def _prepare_seq2(self): + self.__chain_b() + + def __chain_b(self): + # Because isjunk is a user-defined (not C) function, and we test + # for junk a LOT, it's important to minimize the number of calls. + # Before the tricks described here, __chain_b was by far the most + # time-consuming routine in the whole module! If anyone sees + # Jim Roskind, thank him again for profile.py -- I never would + # have guessed that. + # The first trick is to build b2j ignoring the possibility + # of junk. I.e., we don't call isjunk at all yet. Throwing + # out the junk later is much cheaper than building b2j "right" + # from the start. + b = self.b + self.b2j = b2j = {} + + for i, elt in enumerate(b): + indices = b2j.setdefault(elt, []) + indices.append(i) + + # Purge junk elements + self.bjunk = junk = set() + isjunk = self.isjunk + if isjunk: + for elt in b2j.keys(): + if isjunk(elt): + junk.add(elt) + for elt in junk: # separate loop avoids separate list of keys + del b2j[elt] + + # Purge popular elements that are not junk + self.bpopular = popular = set() + n = len(b) + if self.autojunk and n >= 200: + ntest = n // 100 + 1 + for elt, idxs in b2j.items(): + if len(idxs) > ntest: + popular.add(elt) + for elt in popular: # ditto; as fast for 1% deletion + del b2j[elt] + + def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): + """Find longest matching block in a[alo:ahi] and b[blo:bhi]. + + By default it will find the longest match in the entirety of a and b. + + If isjunk is not defined: + + Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where + alo <= i <= i+k <= ahi + blo <= j <= j+k <= bhi + and for all (i',j',k') meeting those conditions, + k >= k' + i <= i' + and if i == i', j <= j' + + In other words, of all maximal matching blocks, return one that + starts earliest in a, and of all those maximal matching blocks that + start earliest in a, return the one that starts earliest in b. + + >>> s = SequenceMatcher(None, " abcd", "abcd abcd") + >>> s.find_longest_match(0, 5, 0, 9) + Match(a=0, b=4, size=5) + + If isjunk is defined, first the longest matching block is + determined as above, but with the additional restriction that no + junk element appears in the block. Then that block is extended as + far as possible by matching (only) junk elements on both sides. So + the resulting block never matches on junk except as identical junk + happens to be adjacent to an "interesting" match. + + Here's the same example as before, but considering blanks to be + junk. That prevents " abcd" from matching the " abcd" at the tail + end of the second sequence directly. 
Instead only the "abcd" can + match, and matches the leftmost "abcd" in the second sequence: + + >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd") + >>> s.find_longest_match(0, 5, 0, 9) + Match(a=1, b=0, size=4) - .ratio() is expensive to compute if you haven't already computed - .get_matching_blocks() or .get_opcodes(), in which case you may - want to try .quick_ratio() or .real_quick_ratio() first to get an - upper bound. + If no blocks match, return (alo, blo, 0). - >>> s = SequenceMatcher(None, "abcd", "bcde") - >>> s.ratio() - 0.75 - >>> s.quick_ratio() - 0.75 - >>> s.real_quick_ratio() - 1.0 + >>> s = SequenceMatcher(None, "ab", "c") + >>> s.find_longest_match(0, 2, 0, 1) + Match(a=0, b=0, size=0) """ - matches = sum(triple[-1] for triple in self.get_matching_blocks()) - return _calculate_ratio(matches, len(self.a) + len(self.b)) + # CAUTION: stripping common prefix or suffix would be incorrect. + # E.g., + # ab + # acab + # Longest matching block is "ab", but if common prefix is + # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so + # strip, so ends up claiming that ab is changed to acab by + # inserting "ca" in the middle. That's minimal but unintuitive: + # "it's obvious" that someone inserted "ac" at the front. + # Windiff ends up at the same place as diff, but by pairing up + # the unique 'b's and then matching the first two 'a's. - def quick_ratio(self): - """Return an upper bound on ratio() relatively quickly. + a, b, b2j, bjunk = self.a, self.b, self.b2j, self.bjunk + if ahi is None: + ahi = len(a) + if bhi is None: + bhi = len(b) + besti, bestj, bestsize = alo, blo, 0 + # find longest junk-free match + # during an iteration of the loop, j2len[j] = length of longest + # junk-free match ending with a[i-1] and b[j] + j2len = {} + nothing = [] + for i in range(alo, ahi): + # look at all instances of a[i] in b; note that because + # b2j has no junk keys, the loop is skipped if a[i] is junk + j2lenget = j2len.get + newj2len = {} + for j in b2j.get(a[i], nothing): + # a[i] matches b[j] + if j < blo: + continue + if j >= bhi: + break + k = newj2len[j] = j2lenget(j-1, 0) + 1 + if k > bestsize: + besti, bestj, bestsize = i-k+1, j-k+1, k + j2len = newj2len - This isn't defined beyond that it is an upper bound on .ratio(), and - is faster to compute. - """ + block = besti, bestj, bestsize + # [2026-02-07@dgpb]: Note, expanding will happen even when no-match + if self.autojunk: + # Extend the best by non-junk elements on each end. In particular, + # "popular" non-junk elements aren't in b2j, which greatly speeds + # the inner loop above, but also means "the best" match so far + # doesn't contain any junk *or* popular non-junk elements. + block = _expand_block_to_junk( + bjunk, block, a, b, alo, ahi, blo, bhi, inverse=True) + + if bjunk: + # Now that we have a wholly interesting match (albeit possibly + # empty!), we may as well suck up the matching junk on each + # side of it too. Can't think of a good reason not to, and it + # saves post-processing the (possibly considerable) expense of + # figuring out what to do with it. In the case of an empty + # interesting match, this is clearly the right thing to do, + # because no other kind of match is possible in the regions. + block = _expand_block_to_junk( + bjunk, block, a, b, alo, ahi, blo, bhi, inverse=False) + + return Match._make(block) + + def _get_matching_blocks(self): + """Return list of triples describing matching subsequences. 
- # viewing a and b as multisets, set matches to the cardinality - # of their intersection; this counts the number of matches - # without regard to order, so is clearly an upper bound - if self.fullbcount is None: - self.fullbcount = fullbcount = {} - for elt in self.b: - fullbcount[elt] = fullbcount.get(elt, 0) + 1 - fullbcount = self.fullbcount - # avail[x] is the number of times x appears in 'b' less the - # number of times we've seen it in 'a' so far ... kinda - avail = {} - matches = 0 - for elt in self.a: - if elt in avail: - numb = avail[elt] - else: - numb = fullbcount.get(elt, 0) - avail[elt] = numb - 1 - if numb > 0: - matches += 1 - return _calculate_ratio(matches, len(self.a) + len(self.b)) + Each triple is of the form (i, j, n), and means that + a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in + i and in j. New in Python 2.5, it's also guaranteed that if + (i, j, n) and (i', j', n') are adjacent triples in the list, and + the second is not the last triple in the list, then i+n != i' or + j+n != j'. IOW, adjacent triples never describe adjacent equal + blocks. - def real_quick_ratio(self): - """Return an upper bound on ratio() very quickly. + The last triple is a dummy, (len(a), len(b), 0), and is the only + triple with n==0. - This isn't defined beyond that it is an upper bound on .ratio(), and - is faster to compute than either .ratio() or .quick_ratio(). + >>> s = SequenceMatcher(None, "abxcd", "abcd") + >>> list(s.get_matching_blocks()) + [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] """ la, lb = len(self.a), len(self.b) - # can't have more matches than the number of elements in the - # shorter sequence - return _calculate_ratio(min(la, lb), la + lb) - __class_getitem__ = classmethod(GenericAlias) + # This is most naturally expressed as a recursive algorithm, but + # at least one user bumped into extreme use cases that exceeded + # the recursion limit on their box. So, now we maintain a list + # ('queue`) of blocks we still need to look at, and append partial + # results to `matching_blocks` in a loop; the matches are sorted + # at the end. + queue = [(0, la, 0, lb)] + matching_blocks = [] + while queue: + alo, ahi, blo, bhi = queue.pop() + i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) + # a[alo:i] vs b[blo:j] unknown + # a[i:i+k] same as b[j:j+k] + # a[i+k:ahi] vs b[j+k:bhi] unknown + if k: # if k is 0, there was no matching block + matching_blocks.append(x) + if alo < i and blo < j: + queue.append((alo, i, blo, j)) + if i+k < ahi and j+k < bhi: + queue.append((i+k, ahi, j+k, bhi)) + matching_blocks.sort() + return matching_blocks + + +######################################################################## +### get_close_matches +######################################################################## -def get_close_matches(word, possibilities, n=3, cutoff=0.6): +def get_close_matches(word, possibilities, n=3, cutoff=0.6, matcher=None): """Use SequenceMatcher to return list of the best "good enough" matches. word is a sequence for which close matches are desired (typically a @@ -679,6 +806,10 @@ def get_close_matches(word, possibilities, n=3, cutoff=0.6): Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities that don't score at least that similar to word are ignored. + Optional arg matcher is a callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. 
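
A hedged usage sketch of the new hook: any callable accepting (isjunk, a, b) positionally and returning a SequenceMatcherBase instance works, for example the GestaltSequenceMatcher class added by this patch.

```python
from difflib import get_close_matches, GestaltSequenceMatcher

words = ["ape", "apple", "peach", "puppy"]
# Default behaviour (SequenceMatcher):
print(get_close_matches("appel", words))
# Same query through the exact, autojunk-free matcher; for inputs this
# short the result is expected to be identical:
print(get_close_matches("appel", words, matcher=GestaltSequenceMatcher))
```
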
+ The best (no more than n) matches among the possibilities are returned in a list, sorted by similarity score, most similar first. @@ -697,16 +828,19 @@ def get_close_matches(word, possibilities, n=3, cutoff=0.6): raise ValueError("n must be > 0: %r" % (n,)) if not 0.0 <= cutoff <= 1.0: raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) + if matcher is None: + matcher = SequenceMatcher + elif not callable(matcher): + raise TypeError("matcher must be callable: %r" % (matcher,)) result = [] - s = SequenceMatcher() + s = matcher() s.set_seq2(word) + set_seq1 = s.set_seq1 + ratio_if_above = s.ratio_if_above for x in possibilities: - s.set_seq1(x) - if s.real_quick_ratio() < cutoff or s.quick_ratio() < cutoff: - continue - - ratio = s.ratio() - if ratio >= cutoff: + set_seq1(x) + ratio = ratio_if_above(cutoff, equal_ok=True) + if ratio is not None: result.append((ratio, x)) # Move the best scorers to head of list @@ -715,6 +849,11 @@ def get_close_matches(word, possibilities, n=3, cutoff=0.6): return [x for score, x in result] +######################################################################## +### Differ +######################################################################## + + def _keep_original_ws(s, tag_s): """Replace whitespace with the original whitespace characters in `s`""" return ''.join( @@ -723,7 +862,6 @@ def _keep_original_ws(s, tag_s): ) - class Differ: r""" Differ is a class for comparing sequences of lines of text, and @@ -810,7 +948,8 @@ class Differ: + 5. Flat is better than nested. """ - def __init__(self, linejunk=None, charjunk=None): + def __init__(self, linejunk=None, charjunk=None, + linematcher=None, charmatcher=None): """ Construct a text differencer, with optional filters. @@ -828,10 +967,27 @@ def __init__(self, linejunk=None, charjunk=None): module-level function `IS_CHARACTER_JUNK` may be used to filter out whitespace characters (a blank or tab; **note**: bad idea to include newline in this!). Use of IS_CHARACTER_JUNK is recommended. - """ + - `linematcher`: callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + + - `charmatcher`: callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + """ + if linematcher is None: + linematcher = SequenceMatcher + elif not callable(linematcher): + raise TypeError("linematcher must be callable: %r" % (linematcher,)) + if charmatcher is None: + charmatcher = SequenceMatcher + elif not callable(charmatcher): + raise TypeError("charmatcher must be callable: %r" % (charmatcher,)) self.linejunk = linejunk self.charjunk = charjunk + self.linematcher = linematcher + self.charmatcher = charmatcher def compare(self, a, b): r""" @@ -859,7 +1015,7 @@ def compare(self, a, b): + emu """ - cruncher = SequenceMatcher(self.linejunk, a, b) + cruncher = self.linematcher(self.linejunk, a, b) for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): if tag == 'replace': g = self._fancy_replace(a, alo, ahi, b, blo, bhi) @@ -920,10 +1076,9 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): # Later, more pathological cases prompted removing recursion # entirely. 
cutoff = 0.74999 - cruncher = SequenceMatcher(self.charjunk) - crqr = cruncher.real_quick_ratio - cqr = cruncher.quick_ratio - cr = cruncher.ratio + cruncher = self.charmatcher(self.charjunk) + set_seq1 = cruncher.set_seq1 + ratio_if_above = cruncher.ratio_if_above WINDOW = 10 best_i = best_j = None @@ -939,13 +1094,12 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi): break best_ratio = cutoff for i in arange: - cruncher.set_seq1(a[i]) + set_seq1(a[i]) # Ordering by cheapest to most expensive ratio is very # valuable, most often getting out early. - if (crqr() > best_ratio - and cqr() > best_ratio - and cr() > best_ratio): - best_i, best_j, best_ratio = i, j, cr() + ratio = ratio_if_above(best_ratio, equal_ok=False) + if ratio is not None: + best_i, best_j, best_ratio = i, j, ratio if best_i is None: # found nothing to synch on yet - move to next j @@ -1097,7 +1251,8 @@ def _format_range_unified(start, stop): return '{},{}'.format(beginning, length) def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', - tofiledate='', n=3, lineterm='\n', *, color=False): + tofiledate='', n=3, lineterm='\n', *, color=False, + matcher=None): r""" Compare two sequences of lines; generate the delta as a unified diff. @@ -1118,6 +1273,10 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', 'git diff --color'. Even if enabled, it can be controlled using environment variables such as 'NO_COLOR'. + Optional arg matcher is a callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + The unidiff format normally has a header for filenames and modification times. Any or all of these may be specified using strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. @@ -1140,6 +1299,10 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', +tree four """ + if matcher is None: + matcher = SequenceMatcher + elif not callable(matcher): + raise TypeError("matcher must be callable: %r" % (matcher,)) if color and can_colorize(): t = get_theme(force_color=True).difflib @@ -1148,7 +1311,7 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) started = False - for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): + for group in matcher(None, a, b).get_grouped_opcodes(n): if not started: started = True fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' @@ -1190,8 +1353,8 @@ def _format_range_context(start, stop): return '{},{}'.format(beginning, beginning + length - 1) # See http://www.unix.org/single_unix_specification/ -def context_diff(a, b, fromfile='', tofile='', - fromfiledate='', tofiledate='', n=3, lineterm='\n'): +def context_diff(a, b, fromfile='', tofile='', fromfiledate='', tofiledate='', + n=3, lineterm='\n', matcher=None): r""" Compare two sequences of lines; generate the delta as a context diff. @@ -1208,6 +1371,10 @@ def context_diff(a, b, fromfile='', tofile='', For inputs that do not have trailing newlines, set the lineterm argument to "" so that the output will be uniformly newline free. + Optional arg matcher is a callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + The context diff format normally has a header for filenames and modification times. 
Any or all of these may be specified using strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. @@ -1233,11 +1400,15 @@ def context_diff(a, b, fromfile='', tofile='', ! tree four """ + if matcher is None: + matcher = SequenceMatcher + elif not callable(matcher): + raise TypeError("matcher must be callable: %r" % (matcher,)) _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ') started = False - for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): + for group in matcher(None, a, b).get_grouped_opcodes(n): if not started: started = True fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' @@ -1319,7 +1490,8 @@ def decode(s): for line in lines: yield line.encode('ascii', 'surrogateescape') -def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): +def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK, + linematcher=None, charmatcher=None): r""" Compare `a` and `b` (lists of strings); return a `Differ`-style delta. @@ -1337,6 +1509,14 @@ def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): whitespace characters (a blank or tab; note: it's a bad idea to include newline in this!). + - `linematcher`: callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + + - `charmatcher`: callable that takes 3 positional arguments. + i.e. matcher(isjunk, a, b) which returns SequenceMatcherBase instance + Default (if None) is SequenceMatcher class. + Tools/scripts/ndiff.py is a command-line front-end to this function. Example: @@ -1354,10 +1534,11 @@ def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): + tree + emu """ - return Differ(linejunk, charjunk).compare(a, b) + return Differ(linejunk, charjunk, linematcher, charmatcher).compare(a, b) -def _mdiff(fromlines, tolines, context=None, linejunk=None, - charjunk=IS_CHARACTER_JUNK): +def _mdiff(fromlines, tolines, context=None, + linejunk=None, charjunk=IS_CHARACTER_JUNK, + linematcher=None, charmatcher=None): r"""Returns generator yielding marked up from/to side by side differences. Arguments: @@ -1367,6 +1548,8 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, if None, all from/to text lines will be generated. linejunk -- passed on to ndiff (see ndiff documentation) charjunk -- passed on to ndiff (see ndiff documentation) + linematcher -- passed on to ndiff (see ndiff documentation) + charmatcher -- passed on to ndiff (see ndiff documentation) This function returns an iterator which returns a tuple: (from line tuple, to line tuple, boolean flag) @@ -1396,7 +1579,8 @@ def _mdiff(fromlines, tolines, context=None, linejunk=None, change_re = re.compile(r'(\++|\-+|\^+)') # create the difference iterator to generate the differences - diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk) + diff_lines_iterator = ndiff(fromlines, tolines, linejunk, charjunk, + linematcher, charmatcher) def _make_line(lines, format_key, side, num_lines=[0,0]): """Returns line of text with user's change markup and line formatting. 
@@ -1626,6 +1810,11 @@ def _line_pair_iterator(): return +######################################################################## +### HtmlDiff +######################################################################## + + _file_template = """ @@ -1735,22 +1924,26 @@ class HtmlDiff(object): _legend = _legend _default_prefix = 0 - def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None, - charjunk=IS_CHARACTER_JUNK): + def __init__(self,tabsize=8, wrapcolumn=None, + linejunk=None, charjunk=IS_CHARACTER_JUNK, + linematcher=None, charmatcher=None): """HtmlDiff instance initializer Arguments: tabsize -- tab stop spacing, defaults to 8. wrapcolumn -- column number where lines are broken and wrapped, defaults to None where lines are not wrapped. - linejunk,charjunk -- keyword arguments passed into ndiff() (used by - HtmlDiff() to generate the side by side HTML differences). See - ndiff() documentation for argument default values and descriptions. + linejunk,charjunk,linematcher,charmatcher -- keyword arguments + passed into ndiff() (used by HtmlDiff() to generate the side + by side HTML differences). See ndiff() documentation for + argument default values and descriptions. """ self._tabsize = tabsize self._wrapcolumn = wrapcolumn self._linejunk = linejunk self._charjunk = charjunk + self._linematcher = linematcher + self._charmatcher = charmatcher def make_file(self, fromlines, tolines, fromdesc='', todesc='', context=False, numlines=5, *, charset='utf-8'): @@ -2021,8 +2214,9 @@ def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False, context_lines = numlines else: context_lines = None - diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk, - charjunk=self._charjunk) + diffs = _mdiff(fromlines, tolines, context_lines, + linejunk=self._linejunk, charjunk=self._charjunk, + linematcher=self._linematcher, charmatcher=self._charmatcher) # set up iterator to wrap lines that exceed desired width if self._wrapcolumn: @@ -2099,3 +2293,846 @@ def restore(delta, which): for line in delta: if line[:2] in prefixes: yield line[2:] + + +######################################################################## +### _LCSUBAutomaton +######################################################################## + + +class _LCSUBAutomaton: + """Suffix Automaton for finding longest common substring. + + Complexity: + T: O(n1 + n2) ~ n1 + 5 × n2 + S: O(n2) : max_nstates = 2 × n if n <= 1 else 2 × n - 1 + n1 = len(seq1) - the one that is being scanned + n2 = len(seq2) - the one that is being built + + Node Structure: + nodes: [ + lengths: list[int], # length of a match + links: list[int], # link to roll back on mismatch + next1s: list[object], # See "Next logic below" + next2s: list[int], # See "Next logic below" + eposs: list[int], # index of last match position + ] + + Next logic (Memory optimization as > 50% of state have only 1 transition): + next2 == -1 -> empty + next2 == -3 -> next1: dict + next2 >= 0 -> next2 - index, next1 - key + + Examples: + >>> aut = _LCSUBAutomaton('abc') + >>> aut + <_LCSUBAutomaton object; seq2_size=3> + >>> aut.build() + >>> aut.print_states() + 0 (0, 0, {'a': 1, 'b': 2, 'c': 3}, -3, 0) + 1 (1, 0, 'b', 2, 0) + 2 (2, 0, 'c', 3, 1) + 3 (3, 0, None, -1, 2) + """ + def __init__(self, seq2, *, junk=()): + """ + Args: + seq2 : Sequence + Automaton will be built for this sequence. 
+ Note, building is ~5x slower than scanning + junk : Iterable + Items in this set will be treated as unmatchable elements + """ + if not isinstance(junk, frozenset): + junk = frozenset(junk) + self.seq2 = seq2 + self.size2 = len(seq2) + self.junk = junk + self.nodes = None + self.cache = (0, 0) + + def __repr__(self): + kwstring = f'seq2_size={self.size2}' + if self.junk: + kwstring += f', junk_size={len(self.junk)}' + return f'<{type(self).__name__} object; {kwstring}>' + + # API ----------------------------- + # --------------------------------- + + def print_states(self, slc=slice(None)): + assert isinstance(slc, slice) + nodes = self.nodes + if nodes is None: + nodes = self.build(0, self.size2) + if slc != slice(None): + nodes = [item[slc] for item in nodes] + for i, state in enumerate(zip(*nodes)): + print(i, state) + + def build(self, start2=0, stop2=None): + """Build automaton for specified range of seq2""" + start2, stop2 = _adjust_indices(self.size2, start2, stop2) + key = (start2, stop2) + if self.cache != key: + self.nodes = None + self.key = (0, 0) + self.nodes = self._build(start2, stop2) + self.cache = key + + def findall(self, seq1, start1=0, stop1=None, start2=0, stop2=None, *, + mink=1, maxk=None, maximal=False): + """Find all common substrings from single O(n) scan + Args: + mink : int + filter out shorter length matches + maxk : int + filter out longer length matches + maximal : bool + Example: 2 sequences: seq2 = 'abcdef', seq1 = 'defabc' + These are matches for each iteration: + 1. 'd' + 2. 'de' + 3. 'def' + 4. 'a' + 5. 'ab' + 6. 'abc' + If maximal is True, then it will only include 'def' and `abc` + """ + if maxk is None: + maxk = _MAXSIZE + if not 0 < mink <= maxk: + raise ValueError(f'not 0 < {mink=} <= {maxk=}') + start1, stop1 = _adjust_indices(len(seq1), start1, stop1) + start2, stop2 = _adjust_indices(self.size2, start2, stop2) + if start1 >= stop1 or start2 >= stop2: + return + + if self.cache != (start2, stop2): + self.build(start2, stop2) + + it = self._finditer(seq1, start1, stop1) + if not maximal: + for block in it: + k = block[2] + if mink <= k and (maxk is None or k <= maxk): + one_mk = 1 - k + yield (block[0] + one_mk, block[1] + one_mk, k) + else: + for last in it: + break + else: + return + k = last[2] + for block in it: + if block[2] < k: + if mink <= k and (maxk is None or k <= maxk): + one_mk = 1 - k + yield (last[0] + one_mk, last[1] + one_mk, k) + last = block + k = last[2] + if mink <= k and (maxk is None or k <= maxk): + one_mk = 1 - k + yield (last[0] + one_mk, last[1] + one_mk, k) + + def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None): + """Find leftmost longest match + Firstly, it will be leftmost in seq1 + Secondly, it will be leftmost in seq2 if more than one occurrence + + Returns: + match: (start_in_seq1, start_in_seq2, match_length) + """ + start1, stop1 = _adjust_indices(len(seq1), start1, stop1) + start2, stop2 = _adjust_indices(self.size2, start2, stop2) + res = self._try_find(seq1, start1, stop1, start2, stop2) + if res is None: + res = self._find(seq1, start1, stop1, start2, stop2) + return res + + def batchfind(self, seq1, bounds_list): + """Performance method for many `find` calls + It calls `find` in order that aims to minimize builds needed + Also, does not evaluate same range twice + Args: + bounds_list : list[tuple[int, int, int, int]] + list of tuples: (start1, stop1, start2, stop2) + """ + if not bounds_list: + return [] + + result = [None] * len(bounds_list) + c_lo, c_hi = self.cache + jobs = 
list(enumerate(bounds_list)) + jobs.sort(key=lambda x: abs((b := x[1])[2] - c_lo) + abs(b[3] - c_hi)) + evaluated = {} + for i, bounds in jobs: + res = evaluated.get(bounds) + if res is None: + res = self.find(seq1, *bounds) + evaluated[bounds] = res + result[i] = res + return result + + # Private API --------------------- + # --------------------------------- + + def _try_find(self, seq1, start1, stop1, start2, stop2): + """Attempts to find match without building + Querying in exactly the same range will always succeed + Also, it might be possible if (start2, stop2) is within cached range + + returns None on fail + """ + if start1 >= stop1 or start2 >= stop2: + return (start1, start2, 0) + + c_start, c_stop = self.cache + if c_start <= start2 and stop2 <= c_stop: + it = self._finditer(seq1, start1, stop1, best=True) + for res in it: + break + else: + return (start1, start2, 0) + + e1, e2, k = res + stop_in_seq2 = e2 + 1 + start_in_seq2 = stop_in_seq2 - k + if start_in_seq2 >= start2 and stop_in_seq2 <= stop2: + return (e1 + 1 - k, start_in_seq2, k) + + def _find(self, seq1, start1, stop1, start2, stop2): + """Returns lefmost longest match + Does not attempt to retrieve from inexactly built range + Always returns an answer + """ + if start1 >= stop1 or start2 >= stop2: + return (start1, start2, 0) + + if self.cache != (start2, stop2): + self.build(start2, stop2) + + it = self._finditer(seq1, start1, stop1, best=True) + for res in it: + break + else: + return (start1, start2, 0) + + e1, e2, k = res + one_mk = 1 - k + return (e1 + one_mk, e2 + one_mk, k) + + # CORE ---------------------------- + # --------------------------------- + + def _make_nodes(self, n): + if n <= 0: + raise ValueError(f'{n=} <= 0') + lengths = [0] * n + links = [0] * n + next1s = [None] * n + next2s = [-1] * n + eposs = [0] * n + return lengths, links, next1s, next2s, eposs + + def _build(self, start2, stop2): + """Automaton builder""" + seq2 = self.seq2 + junk = self.junk + # Make Nodes + size = (stop2 - start2) + n_nodes = 4 * size // 3 + 1 # Maximum 25% overallocation + inc = size // 10 + 1 # Then, 10% increments + nodes = self._make_nodes(n_nodes) + lengths, links, next1s, next2s, eposs = nodes + nstates = 1 + # Loop + last_len = 0 + last = 0 + for j in range(start2, stop2): + el = seq2[j] + if el in junk: + last_len = 0 + last = 0 + continue + + if nstates == n_nodes: + for a, b in zip(nodes, self._make_nodes(inc)): + a.extend(b) + n_nodes += inc + + curr = nstates + nstates += 1 + last_len += 1 + # New Node + lengths[curr] = last_len + eposs[curr] = j + + p = last + px1 = next1s[p] + px2 = next2s[p] + cont = True + while 1: + if px2 == -1: + next1s[p] = el + next2s[p] = curr + elif px2 == -3: + if el not in px1: + px1[el] = curr + else: + break + else: + if el != px1: + next1s[p] = {px1: px2, el: curr} + next2s[p] = -3 + else: + break + if not p: + # p is root! 
+                    cont = False
+                    break
+                p = links[p]
+                px1 = next1s[p]
+                px2 = next2s[p]
+            if cont:
+                if px2 == -3:
+                    q = px1[el]
+                else:
+                    q = px2
+                p_len_p1 = lengths[p] + 1
+                if p_len_p1 == lengths[q]:
+                    links[curr] = q
+                else:
+                    if nstates == n_nodes:
+                        for a, b in zip(nodes, self._make_nodes(inc)):
+                            a.extend(b)
+                        n_nodes += inc
+
+                    clone = nstates
+                    nstates += 1
+                    # Clone
+                    lengths[clone] = p_len_p1
+                    links[clone] = links[q]
+                    qx2 = next2s[q]
+                    if qx2 != -1:
+                        qx1 = next1s[q]
+                        if qx2 == -3:
+                            next1s[clone] = qx1.copy()
+                            next2s[clone] = -3
+                        else:
+                            next1s[clone] = qx1
+                            next2s[clone] = qx2
+                    # Copy `eposs[q]` to ensure leftmost match in seq2
+                    eposs[clone] = eposs[q]
+                    while 1:
+                        if px2 == -3:
+                            if px1.get(el) is q:
+                                px1[el] = clone
+                            else:
+                                break
+                        else:
+                            if px1 == el and px2 == q:
+                                next2s[p] = clone
+                            else:
+                                break
+                        if not p:
+                            # p is root!
+                            break
+                        p = links[p]
+                        px1 = next1s[p]
+                        px2 = next2s[p]
+
+                    links[q] = links[curr] = clone
+
+            last = curr
+
+        # Trim unused state space
+        if nstates < n_nodes:
+            for item in nodes:
+                del item[nstates:]
+        return nodes
+
+    def _finditer(self, seq1, start1, stop1, best=False):
+        """Core scanning routine
+        Args:
+            best : bool
+                False - return all matches, including non-maximal ones
+                True - return all matches of maximum length
+                       (all of these are naturally maximal)
+        Returns:
+            generator of tuples (e1, e2, k), where
+                e1, e2 are ending positions in seq1 and seq2 respectively
+                k is the length of a match
+            Thus, the starting position is: e1 + 1 - k
+            And the stop for a slice is: e1 + 1
+        """
+        if best not in (0, 1):
+            raise ValueError(f'{best=} not in (0, 1)')
+        lengths, links, next1s, next2s, eposs = self.nodes
+        junk = self.junk
+        v = 0
+        k = 0
+        best_len = 0
+        results = []
+        root_x1 = vx1 = next1s[v]
+        root_x2 = vx2 = next2s[v]
+        for i in range(start1, stop1):
+            el = seq1[i]
+            if el in junk:
+                v = 0
+                k = 0
+                vx1 = root_x1
+                vx2 = root_x2
+                continue
+
+            while v and (vx2 == -1 or (el not in vx1 if vx2 == -3 else vx1 != el)):
+                v = links[v]
+                k = lengths[v]
+                vx1 = next1s[v]
+                vx2 = next2s[v]
+
+            if vx2 == -3:
+                v_new = vx1.get(el)
+            elif vx2 == -1:
+                v_new = None
+            else:
+                v_new = vx2 if vx1 == el else None
+            if v_new is not None:
+                v = v_new
+                vx1 = next1s[v]
+                vx2 = next2s[v]
+                k += 1
+                if not best:
+                    yield (i, eposs[v], k)
+                else:
+                    if k > best_len:
+                        best_len = k
+                        results = [(i, v, k)]
+                    elif k == best_len:
+                        results.append((i, v, k))
+        if results:
+            for i, v, k in results:
+                yield (i, eposs[v], k)
+
+
+########################################################################
+### GestaltSequenceMatcher
+########################################################################
+
+
+class _Sentinel:
+    def __init__(self, name):
+        self.name = name
+
+    def __repr__(self):
+        return self.name
+
+    __reduce__ = None
+
+
+# Private sentinels
+_RANGE = _Sentinel('RANGE')                    # Range to process
+_BLOCK = _Sentinel('BLOCK')                    # Block to return
+_RANGEWITHBLOCK = _Sentinel('RANGEWITHBLOCK')  # Range to process & pre-evaluated block
+
+# Modifier sentinels. These are returned as the first tuple item from `_modifier`
+REPLACEBLOCK = _Sentinel('REPLACEBLOCK')  # Block replacement (all else continues as usual)
+ANCHORBLOCKS = _Sentinel('ANCHORBLOCKS')  # List of blocks (not subject to balancing)
+RESULTBLOCKS = _Sentinel('RESULTBLOCKS')  # List of blocks that terminate recursion
+
+
+def _calc_skew(i, j, k, alo, ahi, blo, bhi):
+    """Difference in normalized positions of block mid-points
+    Returns skew : float, where -1 < skew < 1
+    """
+    k_div_2 = k / 2
+    apos = (i + k_div_2 - alo) / (ahi - alo)
+    bpos = (j + k_div_2 - blo) / (bhi - blo)
+    return apos - bpos
+
+
+class GestaltSequenceMatcher(SequenceMatcherBase):
+    """
+    GestaltSequenceMatcher is a flexible class for comparing pairs
+    of sequences of any type, so long as the sequence elements are hashable.
+
+    It builds upon the same idea as `SequenceMatcher`, and with its
+    defaults its results are exactly the same as those of `SequenceMatcher`
+    with the `autojunk` parameter set to False.
+
+    However, while `SequenceMatcher` is able to obtain the same result,
+    running it with `autojunk` set to False is often impractical due to
+    its quadratic worst-case complexity.
+
+    `GestaltSequenceMatcher`, on the other hand, is built on a suffix
+    automaton, which has guaranteed O(n) complexity, making exact
+    calculation practical even on long sequences.
+
+    Furthermore, `GestaltSequenceMatcher` has a `balancing` parameter.
+    By default it is turned off, but it can be turned up to the desired
+    level to reduce the chance of greedily committing to unbalanced
+    matches. It does so by sometimes selecting shorter matches after
+    looking 1 step ahead. This produces more concise diffs with more
+    lines matched, while retaining the block-oriented nature.
+    """
+
+    def __init__(self, isjunk=None, a='', b='', balancing=0):
+        """
+        Args:
+            balancing : float in [0, 1]
+                a ratio that specifies the proportion of `skew` for which
+                balancing action will be attempted.
+                If 0, it is turned off and no balancing will take place.
+
+        Balancing:
+            Balancing action will commence if abs(skew) > 1 - balancing,
+            where -1 <= skew <= 1.
+            The recommended value is 2/3, which means that 2/3 of the
+            worst possible skew values will be eligible for balancing.
+            Note for the future: the balancing procedure scales well
+            to k-strings
+
+        Balancing in action:
+            balancing = 2/3
+            seq1 = '-xx-yy-###-'
+            seq2 = '_###_xx_yy_'
+
+                    m1 = (7 + 10) / 2 = 8.5
+            A       |
+            -xx-yy-###-
+            _###_xx_yy_
+              |     B
+            m2 = (1 + 4) / 2 = 2.5
+
+            skew = 8.5 / 11 - 2.5 / 11 = 0.545
+            do_balancing = abs(skew) > 1 - balancing = 0.545 > 0.333 = True
+
+            Once it has been decided to do balancing, the procedure is:
+            1. Select a set of alternative candidate blocks
+               To do so, we find the longest substring for 2 ranges, each
+               of which excludes the matched block in one of the sequences:
+
+               a) -xx-yy-
+                  _###_xx_yy_
+
+               b) -xx-yy-###-
+                  _xx_yy_
+
+               Thus the full candidate set is:
+                   ###  - the initial longest block
+                   xx   - found in both ranges
+            2. For each candidate, find 2 additional blocks (one on each side)
+                   ###  - has no nearby matches
+                   xx   - has another match, 'yy', on the right
+            3. Select the candidate for which the sum of the 3 block
+               lengths is the highest
+                   ###  - 3 (only the candidate)
+                   xx   - 4 (xx + yy)
+
+            Thus, for this example, xx is picked.
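+
+            As a numeric cross-check of the skew above (illustrative;
+            `_calc_skew` is the module-level private helper used here):
+
+                >>> round(_calc_skew(7, 1, 3, 0, 11, 0, 11), 3)
+                0.545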
+
+        Comparison to SequenceMatcher:
+            In terms of results, the following 2 are equivalent:
+                a) SequenceMatcher(isjunk=None, autojunk=False)
+                b) GestaltSequenceMatcher(isjunk=None, balancing=0)
+
+        Examples:
+            >>> seq1 = 'aaaa_aaaa_bbbbb'
+            >>> seq2 = 'bbbbb-aaaa-aaaa'
+            >>> m1 = GestaltSequenceMatcher(None, seq1, seq2)
+            >>> m2 = GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
+            >>> list(map(tuple, m1.get_matching_blocks()))
+            [(10, 0, 5), (15, 15, 0)]
+            >>> list(map(tuple, m2.get_matching_blocks()))
+            [(0, 6, 4), (5, 11, 4), (15, 15, 0)]
+        """
+        balancing = float(balancing)
+        if not 0 <= balancing <= 1:
+            msg = "'balancing' must be a float between 0 and 1 inclusive."
+            raise ValueError(msg)
+        self.balancing = balancing
+        super().__init__(isjunk, a, b)
+
+    def _prepare_seq2(self):
+        b = self.b
+        self.bjunk = bjunk = set()
+        if self.isjunk:
+            bjunk.update(filter(self.isjunk, _Counter(b)))
+        self.automaton = _LCSUBAutomaton(b, junk=bjunk)
+
+    def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None, *, quick_only=False):
+        """Find longest matching block in a[alo:ahi] and b[blo:bhi].
+        By default it will find the longest match in the entirety of a and b.
+
+        Look up the docstring of SequenceMatcher.find_longest_match
+        for more information.
+
+        The only difference is the `quick_only` argument: if set to True,
+        the method may return None when the answer cannot be produced
+        from the currently built automaton range.
+        """
+        a, b, bjunk = self.a, self.b, self.bjunk
+        alo, ahi = _adjust_indices(len(a), alo, ahi)
+        blo, bhi = _adjust_indices(len(b), blo, bhi)
+        automaton = self.automaton
+        func = automaton._try_find if quick_only else automaton.find
+        block = func(a, alo, ahi, blo, bhi)
+        if block is None:
+            # Only possible for quick_only=True
+            return
+
+        if bjunk:
+            # Extend the match into the surrounding junk
+            # Note: expansion is applied even to a zero-length match
+            block = _expand_block_to_junk(
+                bjunk, block, a, b, alo, ahi, blo, bhi, inverse=False)
+
+        return Match._make(block)
+
+    def batch_find_longest_match(self, bounds_list):
+        """Performance method for many `find_longest_match` calls
+        Queries the automaton in an order that aims to minimize the
+        number of rebuilds, and never evaluates the same range twice
+        Args:
+            bounds_list : list[tuple[int, int, int, int]]
+                list of tuples: (alo, ahi, blo, bhi)
+        """
+        a, b, bjunk = self.a, self.b, self.bjunk
+        bounds_list = list(bounds_list)
+        block_list = self.automaton.batchfind(a, bounds_list)
+        _make_match = Match._make
+        for block, bounds in zip(block_list, bounds_list):
+            if block[2] and bjunk:
+                block = _expand_block_to_junk(bjunk, block, a, b, *bounds)
+            yield _make_match(block)
+
+    def _modifier(self, depth, block, alo, ahi, blo, bhi):
+        """An entry point for intercepting the `_get_matching_blocks` algorithm
+        It is intended to be overridden by derived classes.
+        It can be used for:
+            a) a quick peek into what the algorithm is doing
+            b) modification of the divide-and-conquer algorithm
+
+        Args:
+            depth : int
+                depth 1 is the initial one
+            block : tuple[start_in_a: int, start_in_b: int, length: int]
+                Candidate block for the recursion loop. It is obtained by
+                calling find_longest_match for the current recursion range
+            alo, ahi, blo, bhi : int, int, int, int
+                range of the current recursion iteration
+
+        This method returns None for no action. Otherwise, a tuple of 2 items:
+            1. rtype : _Sentinel
+            2. data : object
+
+        rtype can take one of 3 sentinel values defined in `difflib`.
+        It indicates the type of the return and what it means:
+
+            REPLACEBLOCK - Block replacement (all else continues as usual)
+                e.g. (REPLACEBLOCK, (0, 0, 10))
+
+            ANCHORBLOCKS - List of blocks (not subject to balancing)
+                All ranges around these blocks are subject
+                to further recursion.
+                e.g. (ANCHORBLOCKS, [(0, 0, 10), (10, 10, 10)])
+
+            RESULTBLOCKS - List of blocks that terminate recursion
+                e.g. (RESULTBLOCKS, [(0, 0, 10), (10, 10, 10)])
+
+        If data contains no blocks or only blocks of 0 length,
+        the algorithm does not recurse further.
+
+        Note, one can get `a`, `b`, `automaton`, etc. from self
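+
+        Example (a sketch of a peek-only override; for illustration,
+        not part of the public API):
+
+            class TracingMatcher(GestaltSequenceMatcher):
+                def _modifier(self, depth, block, alo, ahi, blo, bhi):
+                    print(depth, block, (alo, ahi, blo, bhi))
+                    return None    # no action; algorithm continues as usual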
+        """
+        pass
+
+    def _get_matching_blocks(self):
+        balancing = self.balancing
+        alo, ahi, blo, bhi = 0, len(self.a), 0, len(self.b)
+        if alo >= ahi or blo >= bhi:
+            return
+
+        skew_threshold = 1 - balancing
+        # 3-element tuples: (data_type, depth, data)
+        q = [(_RANGE, 1, (alo, ahi, blo, bhi))]
+        while q:
+            dtype, depth, data = q.pop()
+
+            # 1. Decision logic for q items
+            if dtype is _BLOCK:
+                # Just a block to yield
+                yield data
+                continue
+
+            if dtype is _RANGE:
+                # Just the range to process
+                bounds = data
+                i, j, k = init_block = self.find_longest_match(*bounds)
+            elif dtype is _RANGEWITHBLOCK:
+                # Range & pre-evaluated block
+                bounds, init_block = data
+                i, j, k = init_block
+            else:
+                msg = 'Unknown data type: {!r}'
+                raise RuntimeError(msg.format(dtype))
+
+            if not k:
+                continue
+
+            tail_blocks = None
+            alo, ahi, blo, bhi = bounds
+
+            # 2.1. Call modifier method
+            modifier_result = self._modifier(depth, init_block, *bounds)
+            if modifier_result is not None:
+                mtype, data = modifier_result
+
+                # 2.1.1. Prepare for validation
+                if mtype is REPLACEBLOCK:
+                    i, j, k = data
+                    data = [(i, j, k)]
+                elif mtype is ANCHORBLOCKS or mtype is RESULTBLOCKS:
+                    data = sorted(data)
+                else:
+                    msg = 'Unknown `_modifier` return type: {!r}'
+                    raise RuntimeError(msg.format(mtype))
+
+                # 2.1.2. Validate
+                validated = []
+                i0, j0 = alo, blo
+                for i, j, k in data:
+                    if not k:
+                        continue
+                    if not (i0 <= i <= i + k <= ahi) or not (j0 <= j <= j + k <= bhi):
+                        msg = (
+                            '`_modifier` returned an invalid block, which is '
+                            'either out of bounds or overlaps with a nearby '
+                            'one: block={}, while current interval is {}'
+                        )
+                        raise RuntimeError(msg.format((i, j, k), bounds))
+                    validated.append((i, j, k))
+                    i0 = i + k
+                    j0 = j + k
+
+                if not validated:
+                    continue
+
+                # 2.1.3. Apply action
+                if mtype is REPLACEBLOCK:
+                    i, j, k = init_block = validated[0]
+
+                elif mtype is ANCHORBLOCKS:
+                    tail_blocks = validated
+
+                else:
+                    # mtype is RESULTBLOCKS
+                    yield from validated
+                    continue
+
+            # 2.2. Possibly take balancing action
+            if tail_blocks is None and balancing:
+                skew = _calc_skew(i, j, k, alo, ahi, blo, bhi)
+                if abs(skew) > skew_threshold:
+                    i2 = i + k
+                    j2 = j + k
+
+                    # 2.2.1. Select candidates
+                    jobs = []
+                    if skew >= 0:
+                        # a: ---------####--
+                        # b: --####---------
+                        jobs.append((alo, i, blo, bhi))
+                        jobs.append((alo, ahi, j2, bhi))
+                    else:
+                        # a: --####---------
+                        # b: ---------####--
+                        jobs.append((i2, ahi, blo, bhi))
+                        jobs.append((alo, ahi, blo, j))
+
+                    candidates = {init_block}
+                    for block in self.batch_find_longest_match(jobs):
+                        if block[2]:
+                            candidates.add(block)
+
+                    # 2.2.2. Evaluate nearby matches
+                    triples = []
+                    jobs = set()
+                    for block in candidates:
+                        kk = block[2]
+                        if kk:
+                            ii = block[0]
+                            jj = block[1]
+                            lo_args = (alo, ii, blo, jj)
+                            hi_args = (ii + kk, ahi, jj + kk, bhi)
+                            jobs.add(lo_args)
+                            jobs.add(hi_args)
+                            triples.append([lo_args, block, hi_args])
+
+                    block_list = self.batch_find_longest_match(jobs)
+                    job_results = dict(zip(jobs, block_list))
+
+                    # 2.2.3. Pick the middle block of the best triple
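+                    # Each `triple` is [left_block, candidate, right_block];
+                    # e.g. in the docstring example, candidate 'xx' scores
+                    # together with the nearby 'yy', while '###' scores alone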
+                    for triple in triples:
+                        triple[0] = job_results[triple[0]]
+                        triple[2] = job_results[triple[2]]
+                        # NOTE: k**1.3 is empirically tuned:
+                        #       it prefers one long match to many small ones,
+                        #       but not too aggressively, so that it can jump
+                        #       out of skewed positions
+                        total = sum(t[2]**1.3 for t in triple)
+                        skew = _calc_skew(*triple[1], *bounds)
+                        triple.append((total, -abs(skew)))
+                    best = max(triples, key=lambda x: x[-1])
+                    tail_blocks = best[1:2]
+
+            # 2.3. Take the initial block if nothing else has been set
+            if tail_blocks is None:
+                tail_blocks = [init_block]
+
+            # 3.1. Interleave `tail_blocks` with the ranges between them
+            q_tail = []
+            i0, j0 = alo, blo
+            for block in tail_blocks:
+                ii, jj, kk = block
+                if kk:
+                    if i0 < ii and j0 < jj:
+                        q_tail.append((_RANGE, (i0, ii, j0, jj)))
+                    q_tail.append((_BLOCK, block))
+                    i0, j0 = ii + kk, jj + kk
+            if not q_tail:
+                # No blocks identified. Do not recurse further.
+                continue
+            q_tail.append((_RANGE, (i0, ahi, j0, bhi)))
+
+            # 3.2. Yield what is possible straight away
+            q_tail.reverse()
+            while q_tail:
+                dtype, data = q_tail.pop()
+                if dtype is _BLOCK:
+                    yield data
+                elif dtype is _RANGE:
+                    q_tail.append((dtype, data))
+                    q_tail.reverse()
+                    break
+                else:
+                    msg = 'Unknown data type: {!r}'
+                    raise RuntimeError(msg.format(dtype))
+
+            # 3.3. Append to the queue what is not
+            d = depth + 1
+            while q_tail:
+                dtype, data = q_tail.pop()
+                if dtype is _BLOCK:
+                    q.append((dtype, d, data))
+                elif dtype is _RANGE:
+                    # Try quick evaluation before the cache
+                    # is overwritten by further processing
+                    _bounds = data
+                    block = self.find_longest_match(*_bounds, quick_only=True)
+                    if block is not None:
+                        q.append((_RANGEWITHBLOCK, d, (_bounds, block)))
+                    else:
+                        q.append((dtype, d, data))
+                else:
+                    msg = 'Unknown data type: {!r}'
+                    raise RuntimeError(msg.format(dtype))
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index 771fd46e042a41..031e92d64a14f4 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -640,6 +640,133 @@ def test_invalid_input(self):
             ''.join(difflib.restore([], 3))
 
 
+class TestLCSUBAutomaton(unittest.TestCase):
+    def test_find(self):
+        cases = [
+            ('abd', 'abcabd', (0, 3, 3)),
+            ('dab', 'abcabd', (1, 0, 2)),
+        ]
+        for seq1, seq2, expect in cases:
+            result = difflib._LCSUBAutomaton(seq2).find(seq1)
+            self.assertEqual(result, expect)
+
+    def test_find_with_junk(self):
+        cases = [
+            ('ab_abd', 'abcabd', (3, 3, 3)),
+            ('abd_', 'ab_abd_', (0, 3, 3)),
+            ('abcbd', 'abc_bd', (0, 0, 3)),
+            ('cbd', 'abc_bd', (1, 4, 2)),
+        ]
+        for seq1, seq2, expect in cases:
+            result = difflib._LCSUBAutomaton(seq2, junk='_').find(seq1)
+            self.assertEqual(result, expect)
+
+    def test_findall(self):
+        seq1 = 'defabc'
+        aut = difflib._LCSUBAutomaton('abcdef')
+        result = [seq1[i:i+k] for i, j, k in aut.findall(seq1)]
+        self.assertEqual(result, ['d', 'de', 'def', 'a', 'ab', 'abc'])
+        result = [seq1[i:i+k] for i, j, k in aut.findall(seq1, maximal=True)]
+        self.assertEqual(result, ['def', 'abc'])
+        result = [seq1[i:i+k] for i, j, k in aut.findall(seq1, mink=2)]
+        self.assertEqual(result, ['de', 'def', 'ab', 'abc'])
+        result = [seq1[i:i+k] for i, j, k in aut.findall(seq1, maxk=2)]
+        self.assertEqual(result, ['d', 'de', 'a', 'ab'])
+
+    def test_batchfind(self):
+        seq1 = 'fgfedabacba'
+        seq2 = seq1[::-1]
+        n = len(seq1)
+
+        intervals = []
+        for i in range(n - 1):
+            for j in range(i + 1, min(i + 5, n)):
+                intervals.append((i, j))
+        bounds_list = []
+        for alo, ahi in intervals:
+            for blo, bhi in intervals:
+                bounds_list.append((alo, ahi, blo, bhi))
+
+        aut = difflib._LCSUBAutomaton(seq2)
+        results1 = [aut.find(seq1, *bounds) for bounds in bounds_list]
+        results2 = aut.batchfind(seq1, bounds_list)
+        self.assertEqual(results1, results2)
+
+
+seq1_skew = """
+def foo1(a, b):
+    a += 1
+    b += 1
+    return a + b
+
+def foo2(a, b):
+    a += 2
+    b += 2
+    return a + b
+
+def foo3(a, b):
+    c = a + b
+    d = c + a * b
+    r = sum(range(d))
+    return r
+"""
+
+
+seq2_skew = """
+def foo3(a, b):
+    c = a + b
+    d = c + a * b
+    r = sum(range(d))
+    return r
+#
+def foo1(a, b):
+    a += 1
+    b += 1
+    return a + b
+#
+def foo2(a, b):
+    a += 2
+    b += 2
+    return a + b
+"""
+
+
+class TestGestaltSequenceMatcher(unittest.TestCase):
+    def test_cross_test_with_autojunk_false(self):
+        cases = [
+            ("ABCDEFGHIJKLMNOP" * 50, "ACEGIKMOQBDFHJLNP" * 50),
+            (
+                "".join(chr(ord('a') + i % 10) * (i + 1) for i in range(30)),
+                "".join(chr(ord('a') + i % 10) * (30 - i) for i in range(30))
+            ),
+            (
+                "A" + "X"*99 + "BCDEFGHIJKLMNOPQRSTUVWXYZ"*2,
+                "BCDEFGHIJKLMNOPQRSTUVWXYZ"*2 + "A" + "X"*99
+            )
+        ]
+        for seq1, seq2 in cases:
+            for isjunk in [None, lambda x: x in 'aeAE']:
+                sm1 = difflib.SequenceMatcher(isjunk, seq1, seq2, autojunk=False)
+                sm2 = difflib.GestaltSequenceMatcher(isjunk, seq1, seq2)
+                self.assertEqual(sm1.bjunk, sm2.bjunk)
+                blocks1 = sm1.get_matching_blocks()
+                blocks2 = sm2.get_matching_blocks()
+                self.assertEqual(blocks1, blocks2)
+                self.assertAlmostEqual(sm1.ratio(), sm2.ratio(), places=3)
+
+    def test_balancing(self):
+        seq1 = seq1_skew.strip().splitlines()
+        seq2 = seq2_skew.strip().splitlines()
+        sm1 = difflib.GestaltSequenceMatcher(None, seq1, seq2)
+        sm2 = difflib.GestaltSequenceMatcher(None, seq1, seq2, balancing=2/3)
+        blocks1 = list(map(tuple, sm1.get_matching_blocks()))
+        blocks2 = list(map(tuple, sm2.get_matching_blocks()))
+        self.assertEqual(blocks1, [(10, 0, 5), (15, 15, 0)])
+        self.assertEqual(blocks2, [(0, 6, 4), (5, 11, 4), (15, 15, 0)])
+
+
 def setUpModule():
     difflib.HtmlDiff._default_prefix = 0