ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/String-Similarity/fstrcmp.c
Revision: 1.2
Committed: Tue Nov 4 15:31:38 2008 UTC (15 years, 6 months ago) by root
Content type: text/plain
Branch: MAIN
CVS Tags: rel-1_04, HEAD
Changes since 1.1: +8 -10 lines
Log Message:
*** empty log message ***

File Contents

# Content
1 /* Functions to make fuzzy comparisons between strings
2 Copyright (C) 1988, 1989, 1992, 1993, 1995 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or (at
7 your option) any later version.
8
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18
19 Derived from GNU diff 2.7, analyze.c et al.
20
21 The basic algorithm is described in:
22 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
23 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
24 see especially section 4.2, which describes the variation used below.
25
26 The basic algorithm was independently discovered as described in:
27 "Algorithms for Approximate String Matching", E. Ukkonen,
28 Information and Control Vol. 64, 1985, pp. 100-118.
29
30 Modified to work on strings rather than files
31 by Peter Miller <pmiller@agso.gov.au>, October 1995
32
33 Modified to accept a "minimum similarity limit" to stop analyzing the
34 string when the similarity drops below the given limit by Marc Lehmann
35 <schmorp@schmorp.de>.
36
37 Modified to work on unicode (actually 31 bit are allowed) by Marc Lehmann
38 <schmorp@schmorp.de>.
39 */
40
41 #include <string.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <limits.h>
45
46 #include "fstrcmp.h"
47
48 #define PARAMS(proto) proto
49
50 /*
51 * Data on one input string being compared.
52 */
53 struct string_data
54 {
55 /* The string to be compared. */
56 const UV *data;
57
58 /* The length of the string to be compared. */
59 int data_length;
60
61 /* The number of characters inserted or deleted. */
62 int edit_count;
63 };
64
65 static struct string_data string[2];
66
67 static int max_edits; /* compareseq stops when edits > max_edits */
68
69 #ifdef MINUS_H_FLAG
70
71 /* This corresponds to the diff -H flag. With this heuristic, for
72 strings with a constant small density of changes, the algorithm is
73 linear in the strings size. This is unlikely in typical uses of
74 fstrcmp, and so is usually compiled out. Besides, there is no
75 interface to set it true. */
76 static int heuristic;
77
78 #endif
79
80
81 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
82 point furthest along the given diagonal in the forward search of the
83 edit matrix. */
84 static int *fdiag;
85
86 /* Vector, indexed by diagonal, containing the X coordinate of the point
87 furthest along the given diagonal in the backward search of the edit
88 matrix. */
89 static int *bdiag;
90
91 /* Edit scripts longer than this are too expensive to compute. */
92 static int too_expensive;
93
94 /* Snakes bigger than this are considered `big'. */
95 #define SNAKE_LIMIT 20
96
97 struct partition
98 {
99 /* Midpoints of this partition. */
100 int xmid, ymid;
101
102 /* Nonzero if low half will be analyzed minimally. */
103 int lo_minimal;
104
105 /* Likewise for high half. */
106 int hi_minimal;
107 };
108
109
110 /* NAME
111 diag - find diagonal path
112
113 SYNOPSIS
114 int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
115 struct partition *part);
116
117 DESCRIPTION
118 Find the midpoint of the shortest edit script for a specified
119 portion of the two strings.
120
121 Scan from the beginnings of the strings, and simultaneously from
122 the ends, doing a breadth-first search through the space of
123 edit-sequence. When the two searches meet, we have found the
124 midpoint of the shortest edit sequence.
125
126 If MINIMAL is nonzero, find the minimal edit script regardless
127 of expense. Otherwise, if the search is too expensive, use
128 heuristics to stop the search and report a suboptimal answer.
129
130 RETURNS
131 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
132 number XMID - YMID equals the number of inserted characters
133 minus the number of deleted characters (counting only characters
134 before the midpoint). Return the approximate edit cost; this is
135 the total number of characters inserted or deleted (counting
136 only characters before the midpoint), unless a heuristic is used
137 to terminate the search prematurely.
138
139 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
140 for the left half of the partition is known; similarly for
141 PART->RIGHT_MINIMAL.
142
143 CAVEAT
144 This function assumes that the first characters of the specified
145 portions of the two strings do not match, and likewise that the
146 last characters do not match. The caller must trim matching
147 characters from the beginning and end of the portions it is
148 going to specify.
149
150 If we return the "wrong" partitions, the worst this can do is
151 cause suboptimal diff output. It cannot cause incorrect diff
152 output. */
153
154 static int diag PARAMS ((int, int, int, int, int, struct partition *));
155
156 static int
157 diag (xoff, xlim, yoff, ylim, minimal, part)
158 int xoff;
159 int xlim;
160 int yoff;
161 int ylim;
162 int minimal;
163 struct partition *part;
164 {
165 int *const fd = fdiag; /* Give the compiler a chance. */
166 int *const bd = bdiag; /* Additional help for the compiler. */
167 const UV *const xv = string[0].data; /* Still more help for the compiler. */
168 const UV *const yv = string[1].data; /* And more and more . . . */
169 const int dmin = xoff - ylim; /* Minimum valid diagonal. */
170 const int dmax = xlim - yoff; /* Maximum valid diagonal. */
171 const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
172 const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
173 int fmin = fmid;
174 int fmax = fmid; /* Limits of top-down search. */
175 int bmin = bmid;
176 int bmax = bmid; /* Limits of bottom-up search. */
177 int c; /* Cost. */
178 int odd = (fmid - bmid) & 1;
179
180 /*
181 * True if southeast corner is on an odd diagonal with respect
182 * to the northwest.
183 */
184 fd[fmid] = xoff;
185 bd[bmid] = xlim;
186 for (c = 1;; ++c)
187 {
188 int d; /* Active diagonal. */
189 int big_snake;
190
191 big_snake = 0;
192 /* Extend the top-down search by an edit step in each diagonal. */
193 if (fmin > dmin)
194 fd[--fmin - 1] = -1;
195 else
196 ++fmin;
197 if (fmax < dmax)
198 fd[++fmax + 1] = -1;
199 else
200 --fmax;
201 for (d = fmax; d >= fmin; d -= 2)
202 {
203 int x;
204 int y;
205 int oldx;
206 int tlo;
207 int thi;
208
209 tlo = fd[d - 1],
210 thi = fd[d + 1];
211
212 if (tlo >= thi)
213 x = tlo + 1;
214 else
215 x = thi;
216 oldx = x;
217 y = x - d;
218 while (x < xlim && y < ylim && xv[x] == yv[y])
219 {
220 ++x;
221 ++y;
222 }
223 if (x - oldx > SNAKE_LIMIT)
224 big_snake = 1;
225 fd[d] = x;
226 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
227 {
228 part->xmid = x;
229 part->ymid = y;
230 part->lo_minimal = part->hi_minimal = 1;
231 return 2 * c - 1;
232 }
233 }
234 /* Similarly extend the bottom-up search. */
235 if (bmin > dmin)
236 bd[--bmin - 1] = INT_MAX;
237 else
238 ++bmin;
239 if (bmax < dmax)
240 bd[++bmax + 1] = INT_MAX;
241 else
242 --bmax;
243 for (d = bmax; d >= bmin; d -= 2)
244 {
245 int x;
246 int y;
247 int oldx;
248 int tlo;
249 int thi;
250
251 tlo = bd[d - 1],
252 thi = bd[d + 1];
253 if (tlo < thi)
254 x = tlo;
255 else
256 x = thi - 1;
257 oldx = x;
258 y = x - d;
259 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
260 {
261 --x;
262 --y;
263 }
264 if (oldx - x > SNAKE_LIMIT)
265 big_snake = 1;
266 bd[d] = x;
267 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
268 {
269 part->xmid = x;
270 part->ymid = y;
271 part->lo_minimal = part->hi_minimal = 1;
272 return 2 * c;
273 }
274 }
275
276 if (minimal)
277 continue;
278
279 #ifdef MINUS_H_FLAG
280 /* Heuristic: check occasionally for a diagonal that has made lots
281 of progress compared with the edit distance. If we have any
282 such, find the one that has made the most progress and return
283 it as if it had succeeded.
284
285 With this heuristic, for strings with a constant small density
286 of changes, the algorithm is linear in the strings size. */
287 if (c > 200 && big_snake && heuristic)
288 {
289 int best;
290
291 best = 0;
292 for (d = fmax; d >= fmin; d -= 2)
293 {
294 int dd;
295 int x;
296 int y;
297 int v;
298
299 dd = d - fmid;
300 x = fd[d];
301 y = x - d;
302 v = (x - xoff) * 2 - dd;
303
304 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
305 {
306 if
307 (
308 v > best
309 &&
310 xoff + SNAKE_LIMIT <= x
311 &&
312 x < xlim
313 &&
314 yoff + SNAKE_LIMIT <= y
315 &&
316 y < ylim
317 )
318 {
319 /* We have a good enough best diagonal; now insist
320 that it end with a significant snake. */
321 int k;
322
323 for (k = 1; xv[x - k] == yv[y - k]; k++)
324 {
325 if (k == SNAKE_LIMIT)
326 {
327 best = v;
328 part->xmid = x;
329 part->ymid = y;
330 break;
331 }
332 }
333 }
334 }
335 }
336 if (best > 0)
337 {
338 part->lo_minimal = 1;
339 part->hi_minimal = 0;
340 return 2 * c - 1;
341 }
342 best = 0;
343 for (d = bmax; d >= bmin; d -= 2)
344 {
345 int dd;
346 int x;
347 int y;
348 int v;
349
350 dd = d - bmid;
351 x = bd[d];
352 y = x - d;
353 v = (xlim - x) * 2 + dd;
354
355 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
356 {
357 if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
358 yoff < y && y <= ylim - SNAKE_LIMIT)
359 {
360 /* We have a good enough best diagonal; now insist
361 that it end with a significant snake. */
362 int k;
363
364 for (k = 0; xv[x + k] == yv[y + k]; k++)
365 {
366 if (k == SNAKE_LIMIT - 1)
367 {
368 best = v;
369 part->xmid = x;
370 part->ymid = y;
371 break;
372 }
373 }
374 }
375 }
376 }
377 if (best > 0)
378 {
379 part->lo_minimal = 0;
380 part->hi_minimal = 1;
381 return 2 * c - 1;
382 }
383 }
384 #endif /* MINUS_H_FLAG */
385
386 /* Heuristic: if we've gone well beyond the call of duty, give up
387 and report halfway between our best results so far. */
388 if (c >= too_expensive)
389 {
390 int fxybest;
391 int fxbest;
392 int bxybest;
393 int bxbest;
394
395 /* Pacify `gcc -Wall'. */
396 fxbest = 0;
397 bxbest = 0;
398
399 /* Find forward diagonal that maximizes X + Y. */
400 fxybest = -1;
401 for (d = fmax; d >= fmin; d -= 2)
402 {
403 int x;
404 int y;
405
406 x = fd[d] < xlim ? fd[d] : xlim;
407 y = x - d;
408
409 if (ylim < y)
410 {
411 x = ylim + d;
412 y = ylim;
413 }
414 if (fxybest < x + y)
415 {
416 fxybest = x + y;
417 fxbest = x;
418 }
419 }
420 /* Find backward diagonal that minimizes X + Y. */
421 bxybest = INT_MAX;
422 for (d = bmax; d >= bmin; d -= 2)
423 {
424 int x;
425 int y;
426
427 x = xoff > bd[d] ? xoff : bd[d];
428 y = x - d;
429
430 if (y < yoff)
431 {
432 x = yoff + d;
433 y = yoff;
434 }
435 if (x + y < bxybest)
436 {
437 bxybest = x + y;
438 bxbest = x;
439 }
440 }
441 /* Use the better of the two diagonals. */
442 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
443 {
444 part->xmid = fxbest;
445 part->ymid = fxybest - fxbest;
446 part->lo_minimal = 1;
447 part->hi_minimal = 0;
448 }
449 else
450 {
451 part->xmid = bxbest;
452 part->ymid = bxybest - bxbest;
453 part->lo_minimal = 0;
454 part->hi_minimal = 1;
455 }
456 return 2 * c - 1;
457 }
458 }
459 }
460
461
462 /* NAME
463 compareseq - find edit sequence
464
465 SYNOPSIS
466 void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
467
468 DESCRIPTION
469 Compare in detail contiguous subsequences of the two strings
470 which are known, as a whole, to match each other.
471
472 The subsequence of string 0 is [XOFF, XLIM) and likewise for
473 string 1.
474
475 Note that XLIM, YLIM are exclusive bounds. All character
476 numbers are origin-0.
477
478 If MINIMAL is nonzero, find a minimal difference no matter how
479 expensive it is. */
480
481 static void compareseq PARAMS ((int, int, int, int, int));
482
483 static void
484 compareseq (xoff, xlim, yoff, ylim, minimal)
485 int xoff;
486 int xlim;
487 int yoff;
488 int ylim;
489 int minimal;
490 {
491 const UV *const xv = string[0].data; /* Help the compiler. */
492 const UV *const yv = string[1].data;
493
494 if (string[1].edit_count + string[0].edit_count > max_edits)
495 return;
496
497 /* Slide down the bottom initial diagonal. */
498 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
499 {
500 ++xoff;
501 ++yoff;
502 }
503
504 /* Slide up the top initial diagonal. */
505 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
506 {
507 --xlim;
508 --ylim;
509 }
510
511 /* Handle simple cases. */
512 if (xoff == xlim)
513 {
514 while (yoff < ylim)
515 {
516 ++string[1].edit_count;
517 ++yoff;
518 }
519 }
520 else if (yoff == ylim)
521 {
522 while (xoff < xlim)
523 {
524 ++string[0].edit_count;
525 ++xoff;
526 }
527 }
528 else
529 {
530 int c;
531 struct partition part;
532
533 /* Find a point of correspondence in the middle of the strings. */
534 c = diag (xoff, xlim, yoff, ylim, minimal, &part);
535 if (c == 1)
536 {
537 #if 0
538 /* This should be impossible, because it implies that one of
539 the two subsequences is empty, and that case was handled
540 above without calling `diag'. Let's verify that this is
541 true. */
542 abort ();
543 #else
544 /* The two subsequences differ by a single insert or delete;
545 record it and we are done. */
546 if (part.xmid - part.ymid < xoff - yoff)
547 ++string[1].edit_count;
548 else
549 ++string[0].edit_count;
550 #endif
551 }
552 else
553 {
554 /* Use the partitions to split this problem into subproblems. */
555 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
556 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
557 }
558 }
559 }
560
561
562 /* NAME
563 fstrcmp - fuzzy string compare
564
565 SYNOPSIS
566 double fstrcmp(const ChaR *s1, int l1, const UV *s2, int l2, double);
567
568 DESCRIPTION
569 The fstrcmp function may be used to compare two string for
570 similarity. It is very useful in reducing "cascade" or
571 "secondary" errors in compilers or other situations where
572 symbol tables occur.
573
574 RETURNS
575 double; 0 if the strings are entirly dissimilar, 1 if the
576 strings are identical, and a number in between if they are
577 similar. */
578
579 double
580 fstrcmp (const UV *string1, int length1,
581 const UV *string2, int length2,
582 double minimum)
583 {
584 int i;
585
586 size_t fdiag_len;
587 static int *fdiag_buf;
588 static size_t fdiag_max;
589
590 /* set the info for each string. */
591 string[0].data = string1;
592 string[0].data_length = length1;
593 string[1].data = string2;
594 string[1].data_length = length2;
595
596 /* short-circuit obvious comparisons */
597 if (string[0].data_length == 0 && string[1].data_length == 0)
598 return 1.0;
599 if (string[0].data_length == 0 || string[1].data_length == 0)
600 return 0.0;
601
602 /* Set TOO_EXPENSIVE to be approximate square root of input size,
603 bounded below by 256. */
604 too_expensive = 1;
605 for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
606 too_expensive <<= 1;
607 if (too_expensive < 256)
608 too_expensive = 256;
609
610 /* Because fstrcmp is typically called multiple times, while scanning
611 symbol tables, etc, attempt to minimize the number of memory
612 allocations performed. Thus, we use a static buffer for the
613 diagonal vectors, and never free them. */
614 fdiag_len = string[0].data_length + string[1].data_length + 3;
615 if (fdiag_len > fdiag_max)
616 {
617 fdiag_max = fdiag_len;
618 fdiag_buf = realloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));
619 }
620 fdiag = fdiag_buf + string[1].data_length + 1;
621 bdiag = fdiag + fdiag_len;
622
623 max_edits = 1 + (string[0].data_length + string[1].data_length) * (1. - minimum);
624
625 /* Now do the main comparison algorithm */
626 string[0].edit_count = 0;
627 string[1].edit_count = 0;
628 compareseq (0, string[0].data_length, 0, string[1].data_length, 0);
629
630 /* The result is
631 ((number of chars in common) / (average length of the strings)).
632 This is admittedly biased towards finding that the strings are
633 similar, however it does produce meaningful results. */
634 return ((double)
635 (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
636 / (string[0].data_length + string[1].data_length));
637
638 }