blob: 57ee3c5c40d494b01703f40aede0bc6591f03b21 [file] [log] [blame]
Brad Bishop6e60e8b2018-02-01 10:27:11 -05001'''
2Simple Diff for Python version 1.0
3
4Annotate two versions of a list with the values that have been
5changed between the versions, similar to unix's `diff` but with
6a dead-simple Python interface.
7
8(C) Paul Butler 2008-2012 <http://www.paulbutler.org/>
9May be used and distributed under the zlib/libpng license
10<http://www.opensource.org/licenses/zlib-license.php>
11'''
12
13__all__ = ['diff', 'string_diff', 'html_diff']
14__version__ = '1.0'
15
16
17def diff(old, new):
18 '''
19 Find the differences between two lists. Returns a list of pairs, where the
20 first value is in ['+','-','='] and represents an insertion, deletion, or
21 no change for that list. The second value of the pair is the list
22 of elements.
23
24 Params:
25 old the old list of immutable, comparable values (ie. a list
26 of strings)
27 new the new list of immutable, comparable values
28
29 Returns:
30 A list of pairs, with the first part of the pair being one of three
31 strings ('-', '+', '=') and the second part being a list of values from
32 the original old and/or new lists. The first part of the pair
33 corresponds to whether the list of values is a deletion, insertion, or
34 unchanged, respectively.
35
36 Examples:
37 >>> diff([1,2,3,4],[1,3,4])
38 [('=', [1]), ('-', [2]), ('=', [3, 4])]
39
40 >>> diff([1,2,3,4],[2,3,4,1])
41 [('-', [1]), ('=', [2, 3, 4]), ('+', [1])]
42
43 >>> diff('The quick brown fox jumps over the lazy dog'.split(),
44 ... 'The slow blue cheese drips over the lazy carrot'.split())
45 ... # doctest: +NORMALIZE_WHITESPACE
46 [('=', ['The']),
47 ('-', ['quick', 'brown', 'fox', 'jumps']),
48 ('+', ['slow', 'blue', 'cheese', 'drips']),
49 ('=', ['over', 'the', 'lazy']),
50 ('-', ['dog']),
51 ('+', ['carrot'])]
52
53 '''
54
55 # Create a map from old values to their indices
56 old_index_map = dict()
57 for i, val in enumerate(old):
58 old_index_map.setdefault(val,list()).append(i)
59
60 # Find the largest substring common to old and new.
61 # We use a dynamic programming approach here.
62 #
63 # We iterate over each value in the `new` list, calling the
64 # index `inew`. At each iteration, `overlap[i]` is the
65 # length of the largest suffix of `old[:i]` equal to a suffix
66 # of `new[:inew]` (or unset when `old[i]` != `new[inew]`).
67 #
68 # At each stage of iteration, the new `overlap` (called
69 # `_overlap` until the original `overlap` is no longer needed)
70 # is built from the old one.
71 #
72 # If the length of overlap exceeds the largest substring
73 # seen so far (`sub_length`), we update the largest substring
74 # to the overlapping strings.
75
76 overlap = dict()
77 # `sub_start_old` is the index of the beginning of the largest overlapping
78 # substring in the old list. `sub_start_new` is the index of the beginning
79 # of the same substring in the new list. `sub_length` is the length that
80 # overlaps in both.
81 # These track the largest overlapping substring seen so far, so naturally
82 # we start with a 0-length substring.
83 sub_start_old = 0
84 sub_start_new = 0
85 sub_length = 0
86
87 for inew, val in enumerate(new):
88 _overlap = dict()
89 for iold in old_index_map.get(val,list()):
90 # now we are considering all values of iold such that
91 # `old[iold] == new[inew]`.
92 _overlap[iold] = (iold and overlap.get(iold - 1, 0)) + 1
93 if(_overlap[iold] > sub_length):
94 # this is the largest substring seen so far, so store its
95 # indices
96 sub_length = _overlap[iold]
97 sub_start_old = iold - sub_length + 1
98 sub_start_new = inew - sub_length + 1
99 overlap = _overlap
100
101 if sub_length == 0:
102 # If no common substring is found, we return an insert and delete...
103 return (old and [('-', old)] or []) + (new and [('+', new)] or [])
104 else:
105 # ...otherwise, the common substring is unchanged and we recursively
106 # diff the text before and after that substring
107 return diff(old[ : sub_start_old], new[ : sub_start_new]) + \
108 [('=', new[sub_start_new : sub_start_new + sub_length])] + \
109 diff(old[sub_start_old + sub_length : ],
110 new[sub_start_new + sub_length : ])
111
112
113def string_diff(old, new):
114 '''
115 Returns the difference between the old and new strings when split on
116 whitespace. Considers punctuation a part of the word
117
118 This function is intended as an example; you'll probably want
119 a more sophisticated wrapper in practice.
120
121 Params:
122 old the old string
123 new the new string
124
125 Returns:
126 the output of `diff` on the two strings after splitting them
127 on whitespace (a list of change instructions; see the docstring
128 of `diff`)
129
130 Examples:
131 >>> string_diff('The quick brown fox', 'The fast blue fox')
132 ... # doctest: +NORMALIZE_WHITESPACE
133 [('=', ['The']),
134 ('-', ['quick', 'brown']),
135 ('+', ['fast', 'blue']),
136 ('=', ['fox'])]
137
138 '''
139 return diff(old.split(), new.split())
140
141
142def html_diff(old, new):
143 '''
144 Returns the difference between two strings (as in stringDiff) in
145 HTML format. HTML code in the strings is NOT escaped, so you
146 will get weird results if the strings contain HTML.
147
148 This function is intended as an example; you'll probably want
149 a more sophisticated wrapper in practice.
150
151 Params:
152 old the old string
153 new the new string
154
155 Returns:
156 the output of the diff expressed with HTML <ins> and <del>
157 tags.
158
159 Examples:
160 >>> html_diff('The quick brown fox', 'The fast blue fox')
161 'The <del>quick brown</del> <ins>fast blue</ins> fox'
162 '''
163 con = {'=': (lambda x: x),
164 '+': (lambda x: "<ins>" + x + "</ins>"),
165 '-': (lambda x: "<del>" + x + "</del>")}
166 return " ".join([(con[a])(" ".join(b)) for a, b in string_diff(old, new)])
167
168
169def check_diff(old, new):
170 '''
171 This tests that diffs returned by `diff` are valid. You probably won't
172 want to use this function, but it's provided for documentation and
173 testing.
174
175 A diff should satisfy the property that the old input is equal to the
176 elements of the result annotated with '-' or '=' concatenated together.
177 Likewise, the new input is equal to the elements of the result annotated
178 with '+' or '=' concatenated together. This function compares `old`,
179 `new`, and the results of `diff(old, new)` to ensure this is true.
180
181 Tests:
182 >>> check_diff('ABCBA', 'CBABA')
183 >>> check_diff('Foobarbaz', 'Foobarbaz')
184 >>> check_diff('Foobarbaz', 'Boobazbam')
185 >>> check_diff('The quick brown fox', 'Some quick brown car')
186 >>> check_diff('A thick red book', 'A quick blue book')
187 >>> check_diff('dafhjkdashfkhasfjsdafdasfsda', 'asdfaskjfhksahkfjsdha')
188 >>> check_diff('88288822828828288282828', '88288882882828282882828')
189 >>> check_diff('1234567890', '24689')
190 '''
191 old = list(old)
192 new = list(new)
193 result = diff(old, new)
194 _old = [val for (a, vals) in result if (a in '=-') for val in vals]
195 assert old == _old, 'Expected %s, got %s' % (old, _old)
196 _new = [val for (a, vals) in result if (a in '=+') for val in vals]
197 assert new == _new, 'Expected %s, got %s' % (new, _new)
198