ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/String-Similarity/fstrcmp.c
Revision: 1.1
Committed: Sat Jun 25 09:55:53 2005 UTC (18 years, 11 months ago) by root
Content type: text/plain
Branch: MAIN
Log Message:
*** empty log message ***

File Contents

# Content
1 /* Functions to make fuzzy comparisons between strings
2 Copyright (C) 1988, 1989, 1992, 1993, 1995 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or (at
7 your option) any later version.
8
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18
19 Derived from GNU diff 2.7, analyze.c et al.
20
21 The basic algorithm is described in:
22 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
23 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
24 see especially section 4.2, which describes the variation used below.
25
26 The basic algorithm was independently discovered as described in:
27 "Algorithms for Approximate String Matching", E. Ukkonen,
28 Information and Control Vol. 64, 1985, pp. 100-118.
29
30 Modified to work on strings rather than files
31 by Peter Miller <pmiller@agso.gov.au>, October 1995
32
33 Modified to accept a "minimum similarity limit" to stop analyzing the
34 string when the similarity drops below the given limit by Marc Lehmann
35 <schmorp@schmorp.de>.
36
37 Modified to work on unicode (actually 31 bit are allowed) by Marc Lehmann
38 <schmorp@schmorp.de>.
39 */
40
41 #include <string.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <limits.h>
45
46 #include "fstrcmp.h"
47
48 /* moved here from fstrcmp.h to avoid problems on lose32 machines */
49 #define PARAMS(proto) proto
50 typedef UV CHAR;
51
52 /*
53 * Data on one input string being compared.
54 */
55 struct string_data
56 {
57 /* The string to be compared. */
58 const CHAR *data;
59
60 /* The length of the string to be compared. */
61 int data_length;
62
63 /* The number of characters inserted or deleted. */
64 int edit_count;
65 };
66
67 static struct string_data string[2];
68
69 static int max_edits; /* compareseq stops when edits > max_edits */
70
71 #ifdef MINUS_H_FLAG
72
73 /* This corresponds to the diff -H flag. With this heuristic, for
74 strings with a constant small density of changes, the algorithm is
75 linear in the strings size. This is unlikely in typical uses of
76 fstrcmp, and so is usually compiled out. Besides, there is no
77 interface to set it true. */
78 static int heuristic;
79
80 #endif
81
82
83 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
84 point furthest along the given diagonal in the forward search of the
85 edit matrix. */
86 static int *fdiag;
87
88 /* Vector, indexed by diagonal, containing the X coordinate of the point
89 furthest along the given diagonal in the backward search of the edit
90 matrix. */
91 static int *bdiag;
92
93 /* Edit scripts longer than this are too expensive to compute. */
94 static int too_expensive;
95
96 /* Snakes bigger than this are considered `big'. */
97 #define SNAKE_LIMIT 20
98
99 struct partition
100 {
101 /* Midpoints of this partition. */
102 int xmid, ymid;
103
104 /* Nonzero if low half will be analyzed minimally. */
105 int lo_minimal;
106
107 /* Likewise for high half. */
108 int hi_minimal;
109 };
110
111
112 /* NAME
113 diag - find diagonal path
114
115 SYNOPSIS
116 int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
117 struct partition *part);
118
119 DESCRIPTION
120 Find the midpoint of the shortest edit script for a specified
121 portion of the two strings.
122
123 Scan from the beginnings of the strings, and simultaneously from
124 the ends, doing a breadth-first search through the space of
125 edit-sequence. When the two searches meet, we have found the
126 midpoint of the shortest edit sequence.
127
128 If MINIMAL is nonzero, find the minimal edit script regardless
129 of expense. Otherwise, if the search is too expensive, use
130 heuristics to stop the search and report a suboptimal answer.
131
132 RETURNS
133 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
134 number XMID - YMID equals the number of inserted characters
135 minus the number of deleted characters (counting only characters
136 before the midpoint). Return the approximate edit cost; this is
137 the total number of characters inserted or deleted (counting
138 only characters before the midpoint), unless a heuristic is used
139 to terminate the search prematurely.
140
141 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
142 for the left half of the partition is known; similarly for
143 PART->RIGHT_MINIMAL.
144
145 CAVEAT
146 This function assumes that the first characters of the specified
147 portions of the two strings do not match, and likewise that the
148 last characters do not match. The caller must trim matching
149 characters from the beginning and end of the portions it is
150 going to specify.
151
152 If we return the "wrong" partitions, the worst this can do is
153 cause suboptimal diff output. It cannot cause incorrect diff
154 output. */
155
156 static int diag PARAMS ((int, int, int, int, int, struct partition *));
157
158 static int
159 diag (xoff, xlim, yoff, ylim, minimal, part)
160 int xoff;
161 int xlim;
162 int yoff;
163 int ylim;
164 int minimal;
165 struct partition *part;
166 {
167 int *const fd = fdiag; /* Give the compiler a chance. */
168 int *const bd = bdiag; /* Additional help for the compiler. */
169 const CHAR *const xv = string[0].data; /* Still more help for the compiler. */
170 const CHAR *const yv = string[1].data; /* And more and more . . . */
171 const int dmin = xoff - ylim; /* Minimum valid diagonal. */
172 const int dmax = xlim - yoff; /* Maximum valid diagonal. */
173 const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
174 const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
175 int fmin = fmid;
176 int fmax = fmid; /* Limits of top-down search. */
177 int bmin = bmid;
178 int bmax = bmid; /* Limits of bottom-up search. */
179 int c; /* Cost. */
180 int odd = (fmid - bmid) & 1;
181
182 /*
183 * True if southeast corner is on an odd diagonal with respect
184 * to the northwest.
185 */
186 fd[fmid] = xoff;
187 bd[bmid] = xlim;
188 for (c = 1;; ++c)
189 {
190 int d; /* Active diagonal. */
191 int big_snake;
192
193 big_snake = 0;
194 /* Extend the top-down search by an edit step in each diagonal. */
195 if (fmin > dmin)
196 fd[--fmin - 1] = -1;
197 else
198 ++fmin;
199 if (fmax < dmax)
200 fd[++fmax + 1] = -1;
201 else
202 --fmax;
203 for (d = fmax; d >= fmin; d -= 2)
204 {
205 int x;
206 int y;
207 int oldx;
208 int tlo;
209 int thi;
210
211 tlo = fd[d - 1],
212 thi = fd[d + 1];
213
214 if (tlo >= thi)
215 x = tlo + 1;
216 else
217 x = thi;
218 oldx = x;
219 y = x - d;
220 while (x < xlim && y < ylim && xv[x] == yv[y])
221 {
222 ++x;
223 ++y;
224 }
225 if (x - oldx > SNAKE_LIMIT)
226 big_snake = 1;
227 fd[d] = x;
228 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
229 {
230 part->xmid = x;
231 part->ymid = y;
232 part->lo_minimal = part->hi_minimal = 1;
233 return 2 * c - 1;
234 }
235 }
236 /* Similarly extend the bottom-up search. */
237 if (bmin > dmin)
238 bd[--bmin - 1] = INT_MAX;
239 else
240 ++bmin;
241 if (bmax < dmax)
242 bd[++bmax + 1] = INT_MAX;
243 else
244 --bmax;
245 for (d = bmax; d >= bmin; d -= 2)
246 {
247 int x;
248 int y;
249 int oldx;
250 int tlo;
251 int thi;
252
253 tlo = bd[d - 1],
254 thi = bd[d + 1];
255 if (tlo < thi)
256 x = tlo;
257 else
258 x = thi - 1;
259 oldx = x;
260 y = x - d;
261 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
262 {
263 --x;
264 --y;
265 }
266 if (oldx - x > SNAKE_LIMIT)
267 big_snake = 1;
268 bd[d] = x;
269 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
270 {
271 part->xmid = x;
272 part->ymid = y;
273 part->lo_minimal = part->hi_minimal = 1;
274 return 2 * c;
275 }
276 }
277
278 if (minimal)
279 continue;
280
281 #ifdef MINUS_H_FLAG
282 /* Heuristic: check occasionally for a diagonal that has made lots
283 of progress compared with the edit distance. If we have any
284 such, find the one that has made the most progress and return
285 it as if it had succeeded.
286
287 With this heuristic, for strings with a constant small density
288 of changes, the algorithm is linear in the strings size. */
289 if (c > 200 && big_snake && heuristic)
290 {
291 int best;
292
293 best = 0;
294 for (d = fmax; d >= fmin; d -= 2)
295 {
296 int dd;
297 int x;
298 int y;
299 int v;
300
301 dd = d - fmid;
302 x = fd[d];
303 y = x - d;
304 v = (x - xoff) * 2 - dd;
305
306 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
307 {
308 if
309 (
310 v > best
311 &&
312 xoff + SNAKE_LIMIT <= x
313 &&
314 x < xlim
315 &&
316 yoff + SNAKE_LIMIT <= y
317 &&
318 y < ylim
319 )
320 {
321 /* We have a good enough best diagonal; now insist
322 that it end with a significant snake. */
323 int k;
324
325 for (k = 1; xv[x - k] == yv[y - k]; k++)
326 {
327 if (k == SNAKE_LIMIT)
328 {
329 best = v;
330 part->xmid = x;
331 part->ymid = y;
332 break;
333 }
334 }
335 }
336 }
337 }
338 if (best > 0)
339 {
340 part->lo_minimal = 1;
341 part->hi_minimal = 0;
342 return 2 * c - 1;
343 }
344 best = 0;
345 for (d = bmax; d >= bmin; d -= 2)
346 {
347 int dd;
348 int x;
349 int y;
350 int v;
351
352 dd = d - bmid;
353 x = bd[d];
354 y = x - d;
355 v = (xlim - x) * 2 + dd;
356
357 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
358 {
359 if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
360 yoff < y && y <= ylim - SNAKE_LIMIT)
361 {
362 /* We have a good enough best diagonal; now insist
363 that it end with a significant snake. */
364 int k;
365
366 for (k = 0; xv[x + k] == yv[y + k]; k++)
367 {
368 if (k == SNAKE_LIMIT - 1)
369 {
370 best = v;
371 part->xmid = x;
372 part->ymid = y;
373 break;
374 }
375 }
376 }
377 }
378 }
379 if (best > 0)
380 {
381 part->lo_minimal = 0;
382 part->hi_minimal = 1;
383 return 2 * c - 1;
384 }
385 }
386 #endif /* MINUS_H_FLAG */
387
388 /* Heuristic: if we've gone well beyond the call of duty, give up
389 and report halfway between our best results so far. */
390 if (c >= too_expensive)
391 {
392 int fxybest;
393 int fxbest;
394 int bxybest;
395 int bxbest;
396
397 /* Pacify `gcc -Wall'. */
398 fxbest = 0;
399 bxbest = 0;
400
401 /* Find forward diagonal that maximizes X + Y. */
402 fxybest = -1;
403 for (d = fmax; d >= fmin; d -= 2)
404 {
405 int x;
406 int y;
407
408 x = fd[d] < xlim ? fd[d] : xlim;
409 y = x - d;
410
411 if (ylim < y)
412 {
413 x = ylim + d;
414 y = ylim;
415 }
416 if (fxybest < x + y)
417 {
418 fxybest = x + y;
419 fxbest = x;
420 }
421 }
422 /* Find backward diagonal that minimizes X + Y. */
423 bxybest = INT_MAX;
424 for (d = bmax; d >= bmin; d -= 2)
425 {
426 int x;
427 int y;
428
429 x = xoff > bd[d] ? xoff : bd[d];
430 y = x - d;
431
432 if (y < yoff)
433 {
434 x = yoff + d;
435 y = yoff;
436 }
437 if (x + y < bxybest)
438 {
439 bxybest = x + y;
440 bxbest = x;
441 }
442 }
443 /* Use the better of the two diagonals. */
444 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
445 {
446 part->xmid = fxbest;
447 part->ymid = fxybest - fxbest;
448 part->lo_minimal = 1;
449 part->hi_minimal = 0;
450 }
451 else
452 {
453 part->xmid = bxbest;
454 part->ymid = bxybest - bxbest;
455 part->lo_minimal = 0;
456 part->hi_minimal = 1;
457 }
458 return 2 * c - 1;
459 }
460 }
461 }
462
463
464 /* NAME
465 compareseq - find edit sequence
466
467 SYNOPSIS
468 void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
469
470 DESCRIPTION
471 Compare in detail contiguous subsequences of the two strings
472 which are known, as a whole, to match each other.
473
474 The subsequence of string 0 is [XOFF, XLIM) and likewise for
475 string 1.
476
477 Note that XLIM, YLIM are exclusive bounds. All character
478 numbers are origin-0.
479
480 If MINIMAL is nonzero, find a minimal difference no matter how
481 expensive it is. */
482
483 static void compareseq PARAMS ((int, int, int, int, int));
484
485 static void
486 compareseq (xoff, xlim, yoff, ylim, minimal)
487 int xoff;
488 int xlim;
489 int yoff;
490 int ylim;
491 int minimal;
492 {
493 const CHAR *const xv = string[0].data; /* Help the compiler. */
494 const CHAR *const yv = string[1].data;
495
496 if (string[1].edit_count + string[0].edit_count > max_edits)
497 return;
498
499 /* Slide down the bottom initial diagonal. */
500 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
501 {
502 ++xoff;
503 ++yoff;
504 }
505
506 /* Slide up the top initial diagonal. */
507 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
508 {
509 --xlim;
510 --ylim;
511 }
512
513 /* Handle simple cases. */
514 if (xoff == xlim)
515 {
516 while (yoff < ylim)
517 {
518 ++string[1].edit_count;
519 ++yoff;
520 }
521 }
522 else if (yoff == ylim)
523 {
524 while (xoff < xlim)
525 {
526 ++string[0].edit_count;
527 ++xoff;
528 }
529 }
530 else
531 {
532 int c;
533 struct partition part;
534
535 /* Find a point of correspondence in the middle of the strings. */
536 c = diag (xoff, xlim, yoff, ylim, minimal, &part);
537 if (c == 1)
538 {
539 #if 0
540 /* This should be impossible, because it implies that one of
541 the two subsequences is empty, and that case was handled
542 above without calling `diag'. Let's verify that this is
543 true. */
544 abort ();
545 #else
546 /* The two subsequences differ by a single insert or delete;
547 record it and we are done. */
548 if (part.xmid - part.ymid < xoff - yoff)
549 ++string[1].edit_count;
550 else
551 ++string[0].edit_count;
552 #endif
553 }
554 else
555 {
556 /* Use the partitions to split this problem into subproblems. */
557 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
558 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
559 }
560 }
561 }
562
563
564 /* NAME
565 fstrcmp - fuzzy string compare
566
567 SYNOPSIS
568 double fstrcmp(const CHAR *s1, int l1, const CHAR *s2, int l2, double);
569
570 DESCRIPTION
571 The fstrcmp function may be used to compare two string for
572 similarity. It is very useful in reducing "cascade" or
573 "secondary" errors in compilers or other situations where
574 symbol tables occur.
575
576 RETURNS
577 double; 0 if the strings are entirly dissimilar, 1 if the
578 strings are identical, and a number in between if they are
579 similar. */
580
581 double
582 fstrcmp (const CHAR *string1, int length1,
583 const CHAR *string2, int length2,
584 double minimum)
585 {
586 int i;
587
588 size_t fdiag_len;
589 static int *fdiag_buf;
590 static size_t fdiag_max;
591
592 /* set the info for each string. */
593 string[0].data = string1;
594 string[0].data_length = length1;
595 string[1].data = string2;
596 string[1].data_length = length2;
597
598 /* short-circuit obvious comparisons */
599 if (string[0].data_length == 0 && string[1].data_length == 0)
600 return 1.0;
601 if (string[0].data_length == 0 || string[1].data_length == 0)
602 return 0.0;
603
604 /* Set TOO_EXPENSIVE to be approximate square root of input size,
605 bounded below by 256. */
606 too_expensive = 1;
607 for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
608 too_expensive <<= 1;
609 if (too_expensive < 256)
610 too_expensive = 256;
611
612 /* Because fstrcmp is typically called multiple times, while scanning
613 symbol tables, etc, attempt to minimize the number of memory
614 allocations performed. Thus, we use a static buffer for the
615 diagonal vectors, and never free them. */
616 fdiag_len = string[0].data_length + string[1].data_length + 3;
617 if (fdiag_len > fdiag_max)
618 {
619 fdiag_max = fdiag_len;
620 fdiag_buf = realloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));
621 }
622 fdiag = fdiag_buf + string[1].data_length + 1;
623 bdiag = fdiag + fdiag_len;
624
625 max_edits = 1 + (string[0].data_length + string[1].data_length) * (1. - minimum);
626
627 /* Now do the main comparison algorithm */
628 string[0].edit_count = 0;
629 string[1].edit_count = 0;
630 compareseq (0, string[0].data_length, 0, string[1].data_length, 0);
631
632 /* The result is
633 ((number of chars in common) / (average length of the strings)).
634 This is admittedly biased towards finding that the strings are
635 similar, however it does produce meaningful results. */
636 return ((double)
637 (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
638 / (string[0].data_length + string[1].data_length));
639
640 }