| | 1 | package org.openstreetmap.josm.tools; |
| | 2 | |
| | 3 | //// Taken from http://www.bmsi.com/java/#diff |
| | 4 | |
| | 5 | // http://www.bmsi.com/java/DiffPrint.java could also be useful |
| | 6 | |
| | 7 | /* |
| | 8 | * $Log: Diff.java,v $ |
| | 9 | * Revision 1.7 2009/01/19 03:05:26 stuart |
| | 10 | * Fix StackOverflow bug with heuristic on reported by Jimmy Han. |
| | 11 | * |
| | 12 | * Revision 1.6 2003/03/06 22:51:32 stuart |
| | 13 | * Convert to CVS |
| | 14 | * |
| | 15 | * Revision 1.5 2002/07/19 19:14:40 stuart |
| | 16 | * fix reverseScript, make change ctor public, update docs |
| | 17 | * |
| | 18 | * Revision 1.4 2002/04/09 17:53:39 stuart |
| | 19 | * More flexible interface for diff() function. |
| | 20 | * |
| | 21 | * Revision 1.3 2000/03/03 21:58:03 stuart |
| | 22 | * move discard_confusing_lines and shift_boundaries to class file_data |
| | 23 | * |
| | 24 | * Revision 1.2 2000/03/02 16:37:38 stuart |
| | 25 | * Add GPL and copyright |
| | 26 | * |
| | 27 | */ |
| | 28 | |
| | 29 | import java.util.Hashtable; |
| | 30 | |
/**
 * A class to compare vectors of objects. The result of comparison
 * is a list of <code>change</code> objects which form an
 * edit script. The objects compared are traditionally lines
 * of text from two files. Comparison options such as "ignore
 * whitespace" are implemented by modifying the <code>equals</code>
 * and <code>hashCode</code> methods for the objects compared.
 * <p>
 * The basic algorithm is described in: <br>
 * "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
 * Algorithmica Vol. 1 No. 2, 1986, p 251.
 * <p>
 * This class outputs different results from GNU diff 1.15 on some
 * inputs. Our results are actually better (smaller change list, smaller
 * total size of changes), but it would be nice to know why. Perhaps
 * there is a memory overwrite bug in GNU diff 1.15.
 * <p>
 * Instances keep mutable working state in fields while a comparison runs.
 *
 * @author Stuart D. Gathman, translated from GNU diff 1.15
 * Copyright (C) 2000 Business Management Systems, Inc.
 * <p>
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 1, or (at your option)
 * any later version.
 * <p>
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * <p>
 * You should have received a copy of the <a href="COPYING.txt">
 * GNU General Public License</a>
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
public class Diff {

    /** Snakes bigger than this are considered "big". */
    private static final int SNAKE_LIMIT = 20;

    /** Standard ScriptBuilders. */
    public final static ScriptBuilder
        forwardScript = new ForwardScript(),
        reverseScript = new ReverseScript();

    /** 1 more than the maximum equivalence value used for this or its
        sibling file. Equivalence number 0 is never handed out. */
    private int equiv_max = 1;

    /** When set to true, the comparison uses a heuristic to speed it up.
        With this heuristic, for files with a constant small density
        of changes, the algorithm is linear in the file size. */
    public boolean heuristic = false;

    /** When set to true, the algorithm returns a guaranteed minimal
        set of changes. This makes things slower, sometimes much slower. */
    public boolean no_discards = false;

    /** Vectors being compared (equivalence numbers of the undiscarded lines). */
    private int[] xvec, yvec;

    /** Vector, indexed by diagonal, containing the X coordinate of the point
        furthest along the given diagonal in the forward search of the edit
        matrix. */
    private int[] fdiag;

    /** Vector, indexed by diagonal, containing the X coordinate of the point
        furthest along the given diagonal in the backward search of the edit
        matrix. */
    private int[] bdiag;

    /** Offsets added to a (possibly negative) diagonal number to obtain an
        index into <code>fdiag</code> / <code>bdiag</code>. */
    private int fdiagoff, bdiagoff;

    /** The two inputs being compared. */
    private final file_data[] filevec = new file_data[2];

    /** Edit cost reported by the most recent call to <code>diag</code>. */
    private int cost;

    /** When true, <code>shift_boundaries</code> does nothing.
        Nothing in this class ever sets it. */
    private boolean inhibit = false;

    /**
     * Prepare to find differences between two arrays. Each element of
     * the arrays is translated to an "equivalence number" based on
     * the result of <code>equals</code>. The original Object arrays
     * are no longer needed for computing the differences. They will
     * be needed again later to print the results of the comparison as
     * an edit script, if desired.
     *
     * @param a the first sequence ("file 0")
     * @param b the second sequence ("file 1")
     * @throws NullPointerException if either array, or any element of
     *         either array, is null (elements are used as Hashtable keys)
     */
    public Diff(Object[] a, Object[] b) {
        // One table shared by both inputs, so equal elements of a and b
        // receive the same equivalence number.
        Hashtable<Object, Integer> h = new Hashtable<Object, Integer>(a.length + b.length);
        filevec[0] = new file_data(a, h);
        filevec[1] = new file_data(b, h);
    }

    /**
     * Find the midpoint of the shortest edit script for a specified
     * portion of the two files.
     * <p>
     * We scan from the beginnings of the files, and simultaneously from the
     * ends, doing a breadth-first search through the space of edit-sequence.
     * When the two searches meet, we have found the midpoint of the shortest
     * edit sequence.
     * <p>
     * The value returned is the number of the diagonal on which the midpoint
     * lies. The diagonal number equals the number of inserted lines minus the
     * number of deleted lines (counting only lines before the midpoint).
     * The edit cost is stored into <code>cost</code>; this is the total number
     * of lines inserted or deleted (counting only lines before the midpoint).
     * <p>
     * This function assumes that the first lines of the specified portions
     * of the two files do not match, and likewise that the last lines do not
     * match. The caller must trim matching lines from the beginning and end
     * of the portions it is going to specify.
     * <p>
     * Note that if we return the "wrong" diagonal value, or if
     * the value of bdiag at that diagonal is "wrong",
     * the worst this can do is cause suboptimal diff output.
     * It cannot cause incorrect diff output.
     *
     * @param xoff first index (inclusive) of the portion of xvec
     * @param xlim end index (exclusive) of the portion of xvec
     * @param yoff first index (inclusive) of the portion of yvec
     * @param ylim end index (exclusive) of the portion of yvec
     * @return the diagonal on which the midpoint lies
     */
    private int diag(int xoff, int xlim, int yoff, int ylim) {
        final int[] fd = fdiag;         // Give the compiler a chance.
        final int[] bd = bdiag;         // Additional help for the compiler.
        final int[] xv = xvec;          // Still more help for the compiler.
        final int[] yv = yvec;          // And more and more . . .
        final int dmin = xoff - ylim;   // Minimum valid diagonal.
        final int dmax = xlim - yoff;   // Maximum valid diagonal.
        final int fmid = xoff - yoff;   // Center diagonal of top-down search.
        final int bmid = xlim - ylim;   // Center diagonal of bottom-up search.
        int fmin = fmid, fmax = fmid;   // Limits of top-down search.
        int bmin = bmid, bmax = bmid;   // Limits of bottom-up search.
        /* True if southeast corner is on an odd
           diagonal with respect to the northwest. */
        final boolean odd = ((fmid - bmid) & 1) != 0;

        fd[fdiagoff + fmid] = xoff;
        bd[bdiagoff + bmid] = xlim;

        for (int c = 1;; ++c) {
            int d;                      /* Active diagonal. */
            boolean big_snake = false;

            /* Extend the top-down search by an edit step in each diagonal. */
            if (fmin > dmin) {
                fd[fdiagoff + --fmin - 1] = -1;
            } else {
                ++fmin;
            }
            if (fmax < dmax) {
                fd[fdiagoff + ++fmax + 1] = -1;
            } else {
                --fmax;
            }
            for (d = fmax; d >= fmin; d -= 2) {
                int x, y, oldx;
                int tlo = fd[fdiagoff + d - 1];
                int thi = fd[fdiagoff + d + 1];

                if (tlo >= thi) {
                    x = tlo + 1;
                } else {
                    x = thi;
                }
                oldx = x;
                y = x - d;
                /* Follow the snake (run of matching lines) as far as it goes. */
                while (x < xlim && y < ylim && xv[x] == yv[y]) {
                    ++x;
                    ++y;
                }
                if (x - oldx > SNAKE_LIMIT) {
                    big_snake = true;
                }
                fd[fdiagoff + d] = x;
                if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
                    cost = 2 * c - 1;
                    return d;
                }
            }

            /* Similarly extend the bottom-up search. */
            if (bmin > dmin) {
                bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
            } else {
                ++bmin;
            }
            if (bmax < dmax) {
                bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
            } else {
                --bmax;
            }
            for (d = bmax; d >= bmin; d -= 2) {
                int x, y, oldx;
                int tlo = bd[bdiagoff + d - 1];
                int thi = bd[bdiagoff + d + 1];

                if (tlo < thi) {
                    x = tlo;
                } else {
                    x = thi - 1;
                }
                oldx = x;
                y = x - d;
                while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
                    --x;
                    --y;
                }
                if (oldx - x > SNAKE_LIMIT) {
                    big_snake = true;
                }
                bd[bdiagoff + d] = x;
                if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
                    cost = 2 * c;
                    return d;
                }
            }

            /* Heuristic: check occasionally for a diagonal that has made
               lots of progress compared with the edit distance.
               If we have any such, find the one that has made the most
               progress and return it as if it had succeeded.

               With this heuristic, for files with a constant small density
               of changes, the algorithm is linear in the file size. */

            if (c > 200 && big_snake && heuristic) {
                int best = 0;
                int bestpos = -1;

                for (d = fmax; d >= fmin; d -= 2) {
                    int dd = d - fmid;
                    int x = fd[fdiagoff + d];
                    int y = x - d;
                    int v = (x - xoff) * 2 - dd;
                    if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
                        if (v > best
                                && xoff + SNAKE_LIMIT <= x && x < xlim
                                && yoff + SNAKE_LIMIT <= y && y < ylim) {
                            /* We have a good enough best diagonal;
                               now insist that it end with a significant snake.
                               Indices x-1 .. x-SNAKE_LIMIT stay >= xoff
                               thanks to the range test above. */
                            for (int k = 1; xv[x - k] == yv[y - k]; k++) {
                                if (k == SNAKE_LIMIT) {
                                    best = v;
                                    bestpos = d;
                                    break;
                                }
                            }
                        }
                    }
                }
                if (best > 0) {
                    cost = 2 * c - 1;
                    return bestpos;
                }

                best = 0;
                for (d = bmax; d >= bmin; d -= 2) {
                    int dd = d - bmid;
                    int x = bd[bdiagoff + d];
                    int y = x - d;
                    int v = (xlim - x) * 2 + dd;
                    if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
                        if (v > best
                                && xoff < x && x <= xlim - SNAKE_LIMIT
                                && yoff < y && y <= ylim - SNAKE_LIMIT) {
                            /* We have a good enough best diagonal;
                               now insist that it end with a significant snake.
                               The snake covers indices x .. x+SNAKE_LIMIT-1, so
                               stop at k == SNAKE_LIMIT - 1: the range test above
                               only guarantees x + SNAKE_LIMIT <= xlim, and index
                               xlim itself may lie past the end of the vector. */
                            for (int k = 0; xv[x + k] == yv[y + k]; k++) {
                                if (k == SNAKE_LIMIT - 1) {
                                    best = v;
                                    bestpos = d;
                                    break;
                                }
                            }
                        }
                    }
                }
                if (best > 0) {
                    cost = 2 * c - 1;
                    return bestpos;
                }
            }
        }
    }

    /**
     * Compare in detail contiguous subsequences of the two files
     * which are known, as a whole, to match each other.
     * <p>
     * The results are recorded in the vectors filevec[N].changed_flag, by
     * storing true in the element for each line that is an insertion or
     * deletion.
     * <p>
     * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
     * <p>
     * Note that XLIM, YLIM are exclusive bounds.
     * All line numbers are origin-0 and discarded lines are not counted.
     *
     * @param xoff first index (inclusive) in xvec
     * @param xlim end index (exclusive) in xvec
     * @param yoff first index (inclusive) in yvec
     * @param ylim end index (exclusive) in yvec
     * @throws IllegalArgumentException if <code>diag</code> reports an edit
     *         cost of 1, which should be impossible here
     */
    private void compareseq(int xoff, int xlim, int yoff, int ylim) {
        /* Slide down the bottom initial diagonal. */
        while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
            ++xoff;
            ++yoff;
        }
        /* Slide up the top initial diagonal. */
        while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
            --xlim;
            --ylim;
        }

        /* Handle simple cases. */
        if (xoff == xlim) {
            /* Nothing left in file 0: the rest of file 1 is inserted. */
            while (yoff < ylim) {
                filevec[1].changed_flag[1 + filevec[1].realindexes[yoff++]] = true;
            }
        } else if (yoff == ylim) {
            /* Nothing left in file 1: the rest of file 0 is deleted. */
            while (xoff < xlim) {
                filevec[0].changed_flag[1 + filevec[0].realindexes[xoff++]] = true;
            }
        } else {
            /* Find a point of correspondence in the middle of the files. */
            int d = diag(xoff, xlim, yoff, ylim);
            int c = cost;
            int b = bdiag[bdiagoff + d];

            if (c == 1) {
                /* This should be impossible, because it implies that
                   one of the two subsequences is empty,
                   and that case was handled above without calling `diag'.
                   Let's verify that this is true. */
                throw new IllegalArgumentException("Empty subsequence");
            }

            /* Use that point to split this problem into two subproblems. */
            compareseq(xoff, b, yoff, b - d);
            /* This used to use f (the forward position) instead of b,
               but that is incorrect!
               It is not necessarily the case that diagonal d
               has a snake from b to f. */
            compareseq(b, xlim, b - d, ylim);
        }
    }

    /** Discard lines from one file that have no matches in the other file. */
    private void discard_confusing_lines() {
        filevec[0].discard_confusing_lines(filevec[1]);
        filevec[1].discard_confusing_lines(filevec[0]);
    }

    /** Adjust inserts/deletes of blank lines to join changes
        as much as possible. */
    private void shift_boundaries() {
        if (inhibit) {
            return;
        }
        filevec[0].shift_boundaries(filevec[1]);
        filevec[1].shift_boundaries(filevec[0]);
    }

    /** Strategy that turns the per-line change flags into an edit script. */
    public interface ScriptBuilder {
        /** Scan the tables of which lines are inserted and deleted,
            producing an edit script.
            Both flag arrays are indexed by origin-1 line number and carry
            one extra, always-false element at each end.
            @param changed0 true for lines in first file which do not match 2nd
            @param len0 number of lines in first file
            @param changed1 true for lines in 2nd file which do not match 1st
            @param len1 number of lines in 2nd file
            @return a linked list of changes - or null
         */
        change build_script(
            boolean[] changed0, int len0,
            boolean[] changed1, int len1
        );
    }

    /** Scan the tables of which lines are inserted and deleted,
        producing an edit script in reverse order. */
    static class ReverseScript implements ScriptBuilder {
        public change build_script(
                final boolean[] changed0, int len0,
                final boolean[] changed1, int len1) {
            change script = null;
            int i0 = 0, i1 = 0;
            while (i0 < len0 || i1 < len1) {
                if (changed0[1 + i0] || changed1[1 + i1]) {
                    int line0 = i0, line1 = i1;

                    /* Find # lines changed here in each file.
                       The always-false element past the end stops each scan. */
                    while (changed0[1 + i0]) {
                        ++i0;
                    }
                    while (changed1[1 + i1]) {
                        ++i1;
                    }

                    /* Record this change; consing onto the front while scanning
                       forwards leaves the list in reverse order. */
                    script = new change(line0, line1, i0 - line0, i1 - line1, script);
                }

                /* We have reached lines in the two files that match each other. */
                i0++;
                i1++;
            }

            return script;
        }
    }

    /** Scan the tables of which lines are inserted and deleted,
        producing an edit script in forward order. */
    static class ForwardScript implements ScriptBuilder {
        public change build_script(
                final boolean[] changed0, int len0,
                final boolean[] changed1, int len1) {
            change script = null;
            int i0 = len0, i1 = len1;

            while (i0 >= 0 || i1 >= 0) {
                if (changed0[i0] || changed1[i1]) {
                    int line0 = i0, line1 = i1;

                    /* Find # lines changed here in each file.
                       The always-false element 0 stops each scan. */
                    while (changed0[i0]) {
                        --i0;
                    }
                    while (changed1[i1]) {
                        --i1;
                    }

                    /* Record this change; consing onto the front while scanning
                       backwards leaves the list in forward order. */
                    script = new change(i0, i1, line0 - i0, line1 - i1, script);
                }

                /* We have reached lines in the two files that match each other. */
                i0--;
                i1--;
            }

            return script;
        }
    }

    /**
     * Report the differences of the two inputs as an edit script, using one
     * of the standard script builders.
     *
     * @param reverse true to get the changes in reverse order
     *        (<code>reverseScript</code>), false for forward order
     *        (<code>forwardScript</code>)
     * @return the head of a list of changes, or null if there are none
     */
    public final change diff_2(final boolean reverse) {
        return diff(reverse ? reverseScript : forwardScript);
    }

    /** Get the results of comparison as an edit script. The script
        is described by a list of changes. The standard ScriptBuilder
        implementations provide for forward and reverse edit scripts.
        Alternate implementations could, for instance, list common elements
        instead of differences.
        @param bld an object to build the script from change flags
        @return the head of a list of changes
     */
    public change diff(final ScriptBuilder bld) {

        /* Some lines are obviously insertions or deletions
           because they don't match anything. Detect them now,
           and avoid even thinking about them in the main comparison algorithm. */

        discard_confusing_lines();

        /* Now do the main comparison algorithm, considering just the
           undiscarded lines. */

        xvec = filevec[0].undiscarded;
        yvec = filevec[1].undiscarded;

        /* Diagonals run from -(lines in file 1) to +(lines in file 0);
           the extra slots hold the sentinels written by diag. */
        int diags =
            filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
        fdiag = new int[diags];
        fdiagoff = filevec[1].nondiscarded_lines + 1;
        bdiag = new int[diags];
        bdiagoff = filevec[1].nondiscarded_lines + 1;

        compareseq(0, filevec[0].nondiscarded_lines,
                   0, filevec[1].nondiscarded_lines);
        fdiag = null;
        bdiag = null;

        /* Modify the results slightly to make them prettier
           in cases where that can validly be done. */

        shift_boundaries();

        /* Get the results of comparison in the form of a chain
           of `struct change's -- an edit script. */
        return bld.build_script(
            filevec[0].changed_flag,
            filevec[0].buffered_lines,
            filevec[1].changed_flag,
            filevec[1].buffered_lines
        );
    }

    /** The result of comparison is an "edit script": a chain of change objects.
        Each change represents one place where some lines are deleted
        and some are inserted.
        <p>
        LINE0 and LINE1 are the first affected lines in the two files (origin 0).
        DELETED is the number of lines deleted here from file 0.
        INSERTED is the number of lines inserted here in file 1.
        <p>
        If DELETED is 0 then LINE0 is the number of the line before
        which the insertion was done; vice versa for INSERTED and LINE1. */
    public static class change {
        /** Previous or next edit command. */
        public change link;
        /** # lines of file 1 changed here. */
        public final int inserted;
        /** # lines of file 0 changed here. */
        public final int deleted;
        /** Line number of 1st deleted line. */
        public final int line0;
        /** Line number of 1st inserted line. */
        public final int line1;

        /** Cons an additional entry onto the front of an edit script OLD.
            LINE0 and LINE1 are the first affected lines in the two files (origin 0).
            DELETED is the number of lines deleted here from file 0.
            INSERTED is the number of lines inserted here in file 1.
            <p>
            If DELETED is 0 then LINE0 is the number of the line before
            which the insertion was done; vice versa for INSERTED and LINE1.
            @param line0 first affected line in file 0
            @param line1 first affected line in file 1
            @param deleted number of lines deleted from file 0
            @param inserted number of lines inserted in file 1
            @param old the script to prepend to, or null
         */
        public change(int line0, int line1, int deleted, int inserted, change old) {
            this.line0 = line0;
            this.line1 = line1;
            this.inserted = inserted;
            this.deleted = deleted;
            this.link = old;
        }
    }

    /** Data on one input file being compared. */
    class file_data {

        /** Number of elements (lines) in this file. */
        final int buffered_lines;

        /** Vector, indexed by line number, containing an equivalence code for
            each line. It is this vector that is actually compared with that
            of another file to generate differences. */
        private final int[] equivs;

        /** Vector, like the previous one except that
            the elements for discarded lines have been squeezed out. */
        final int[] undiscarded;

        /** Vector mapping virtual line numbers (not counting discarded lines)
            to real ones (counting those lines). Both are origin-0. */
        final int[] realindexes;

        /** Total number of nondiscarded lines. */
        int nondiscarded_lines;

        /** Array, indexed by real origin-1 line number,
            containing true for a line that is an insertion or a deletion.
            The results of comparison are stored here. */
        boolean[] changed_flag;

        /**
         * Translate every element of <code>data</code> into an equivalence
         * number, allocating a fresh number from the enclosing Diff for each
         * element not yet present in <code>h</code>.
         *
         * @param data the elements of this file
         * @param h table from element to equivalence number, shared with the
         *        sibling file; updated in place
         * @throws NullPointerException if an element is null
         */
        file_data(Object[] data, Hashtable<Object, Integer> h) {
            buffered_lines = data.length;

            equivs = new int[buffered_lines];
            undiscarded = new int[buffered_lines];
            realindexes = new int[buffered_lines];

            for (int i = 0; i < data.length; ++i) {
                Integer ir = h.get(data[i]);
                if (ir == null) {
                    equivs[i] = equiv_max++;
                    h.put(data[i], Integer.valueOf(equivs[i]));
                } else {
                    equivs[i] = ir.intValue();
                }
            }
        }

        /** Allocate changed array for the results of comparison. */
        void clear() {
            /* Allocate a flag for each line of each file, saying whether that line
               is an insertion or deletion.
               Allocate an extra element, always false, at each end of each vector.
             */
            changed_flag = new boolean[buffered_lines + 2];
        }

        /** Return equiv_count[I] as the number of lines in this file
            that fall in equivalence class I.
            @return the array of equivalence class counts.
         */
        int[] equivCount() {
            int[] equiv_count = new int[equiv_max];
            for (int i = 0; i < buffered_lines; ++i) {
                ++equiv_count[equivs[i]];
            }
            return equiv_count;
        }

        /** Discard lines that have no matches in another file.
            <p>
            A line which is discarded will not be considered by the actual
            comparison algorithm; it will be as if that line were not in the file.
            The file's `realindexes' table maps virtual line numbers
            (which don't count the discarded lines) into real line numbers;
            this is how the actual comparison algorithm produces results
            that are comprehensible when the discarded lines are counted.
            <p>
            When we discard a line, we also mark it as a deletion or insertion
            so that it will be printed in the output.
            @param f the other file
         */
        void discard_confusing_lines(file_data f) {
            clear();
            /* Set up table of which lines are going to be discarded. */
            final byte[] discarded = discardable(f.equivCount());

            /* Don't really discard the provisional lines except when they occur
               in a run of discardables, with nonprovisionals at the beginning
               and end. */
            filterDiscards(discarded);

            /* Actually discard the lines. */
            discard(discarded);
        }

        /** Mark to be discarded each line that matches no line of another file.
            If a line matches many lines, mark it as provisionally discardable.
            @see #equivCount()
            @param counts The count of each equivalence number for the other file.
            @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
            for each line
         */
        private byte[] discardable(final int[] counts) {
            final int end = buffered_lines;
            final byte[] discards = new byte[end];
            final int[] equivs = this.equivs;
            int many = 5;
            int tem = end / 64;

            /* Multiply MANY by approximate square root of number of lines.
               That is the threshold for provisionally discardable lines. */
            while ((tem = tem >> 2) > 0) {
                many *= 2;
            }

            for (int i = 0; i < end; i++) {
                if (equivs[i] == 0) {
                    continue;
                }
                int nmatch = counts[equivs[i]];
                if (nmatch == 0) {
                    discards[i] = 1;
                } else if (nmatch > many) {
                    discards[i] = 2;
                }
            }
            return discards;
        }

        /** Don't really discard the provisional lines except when they occur
            in a run of discardables, with nonprovisionals at the beginning
            and end.
            @param discards per-line flags as returned by <code>discardable</code>;
                   modified in place
         */
        private void filterDiscards(final byte[] discards) {
            final int end = buffered_lines;

            for (int i = 0; i < end; i++) {
                /* Cancel provisional discards not in middle of run of discards. */
                if (discards[i] == 2) {
                    discards[i] = 0;
                } else if (discards[i] != 0) {
                    /* We have found a nonprovisional discard. */
                    int j;
                    int length;
                    int provisional = 0;

                    /* Find end of this run of discardable lines.
                       Count how many are provisionally discardable. */
                    for (j = i; j < end; j++) {
                        if (discards[j] == 0) {
                            break;
                        }
                        if (discards[j] == 2) {
                            ++provisional;
                        }
                    }

                    /* Cancel provisional discards at end, and shrink the run. */
                    while (j > i && discards[j - 1] == 2) {
                        discards[--j] = 0;
                        --provisional;
                    }

                    /* Now we have the length of a run of discardable lines
                       whose first and last are not provisional. */
                    length = j - i;

                    /* If more than 1/4 of the lines in the run are provisional,
                       cancel discarding of all provisional lines in the run. */
                    if (provisional * 4 > length) {
                        while (j > i) {
                            if (discards[--j] == 2) {
                                discards[j] = 0;
                            }
                        }
                    } else {
                        int consec;
                        int minimum = 1;
                        int tem = length / 4;

                        /* MINIMUM is approximate square root of LENGTH/4.
                           A subrun of two or more provisionals can stand
                           when LENGTH is at least 16.
                           A subrun of 4 or more can stand when LENGTH >= 64. */
                        while ((tem = tem >> 2) > 0) {
                            minimum *= 2;
                        }
                        minimum++;

                        /* Cancel any subrun of MINIMUM or more provisionals
                           within the larger run. */
                        for (j = 0, consec = 0; j < length; j++) {
                            if (discards[i + j] != 2) {
                                consec = 0;
                            } else if (minimum == ++consec) {
                                /* Back up to start of subrun, to cancel it all. */
                                j -= consec;
                            } else if (minimum < consec) {
                                discards[i + j] = 0;
                            }
                        }

                        /* Scan from beginning of run
                           until we find 3 or more nonprovisionals in a row
                           or until the first nonprovisional at least 8 lines in.
                           Until that point, cancel any provisionals. */
                        for (j = 0, consec = 0; j < length; j++) {
                            if (j >= 8 && discards[i + j] == 1) {
                                break;
                            }
                            if (discards[i + j] == 2) {
                                consec = 0;
                                discards[i + j] = 0;
                            } else if (discards[i + j] == 0) {
                                consec = 0;
                            } else {
                                consec++;
                            }
                            if (consec == 3) {
                                break;
                            }
                        }

                        /* I advances to the last line of the run. */
                        i += length - 1;

                        /* Same thing, from end. */
                        for (j = 0, consec = 0; j < length; j++) {
                            if (j >= 8 && discards[i - j] == 1) {
                                break;
                            }
                            if (discards[i - j] == 2) {
                                consec = 0;
                                discards[i - j] = 0;
                            } else if (discards[i - j] == 0) {
                                consec = 0;
                            } else {
                                consec++;
                            }
                            if (consec == 3) {
                                break;
                            }
                        }
                    }
                }
            }
        }

        /** Actually discard the lines.
            Kept lines are copied to <code>undiscarded</code> and
            <code>realindexes</code>; every other line is flagged as changed.
            When <code>no_discards</code> is set, every line is kept.
            @param discards flags lines to be discarded
         */
        private void discard(final byte[] discards) {
            final int end = buffered_lines;
            int j = 0;
            for (int i = 0; i < end; ++i) {
                if (no_discards || discards[i] == 0) {
                    undiscarded[j] = equivs[i];
                    realindexes[j++] = i;
                } else {
                    changed_flag[1 + i] = true;
                }
            }
            nondiscarded_lines = j;
        }

        /** Adjust inserts/deletes of blank lines to join changes
            as much as possible.
            <p>
            We do something when a run of changed lines include a blank
            line at one end and have an excluded blank line at the other.
            We are free to choose which blank line is included.
            `compareseq' always chooses the one at the beginning,
            but usually it is cleaner to consider the following blank line
            to be the "change". The only exception is if the preceding blank line
            would join this change to other changes.
            @param f the file being compared against
         */
        void shift_boundaries(file_data f) {
            final boolean[] changed = changed_flag;
            final boolean[] other_changed = f.changed_flag;
            int i = 0;
            int j = 0;
            int i_end = buffered_lines;
            int preceding = -1;
            int other_preceding = -1;

            for (;;) {
                int start, end, other_start;

                /* Scan forwards to find beginning of another run of changes.
                   Also keep track of the corresponding point in the other file. */

                while (i < i_end && !changed[1 + i]) {
                    while (other_changed[1 + j++]) {
                        /* Non-corresponding lines in the other file
                           will count as the preceding batch of changes. */
                        other_preceding = j;
                    }
                    i++;
                }

                if (i == i_end) {
                    break;
                }

                start = i;
                other_start = j;

                for (;;) {
                    /* Now find the end of this run of changes. */

                    while (i < i_end && changed[1 + i]) {
                        i++;
                    }
                    end = i;

                    /* If the first changed line matches the following unchanged one,
                       and this run does not follow right after a previous run,
                       and there are no lines deleted from the other file here,
                       then classify the first changed line as unchanged
                       and the following line as changed in its place. */

                    /* You might ask, how could this run follow right after another?
                       Only because the previous run was shifted here. */

                    if (end != i_end
                            && equivs[start] == equivs[end]
                            && !other_changed[1 + j]
                            && !((preceding >= 0 && start == preceding)
                                 || (other_preceding >= 0
                                     && other_start == other_preceding))) {
                        changed[1 + end++] = true;
                        changed[1 + start++] = false;
                        ++i;
                        /* Since one line-that-matches is now before this run
                           instead of after, we must advance in the other file
                           to keep in synch. */
                        ++j;
                    } else {
                        break;
                    }
                }

                preceding = i;
                other_preceding = j;
            }
        }
    }
}