ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/String-Similarity/fstrcmp.c
Revision: 1.2
Committed: Tue Nov 4 15:31:38 2008 UTC (15 years, 7 months ago) by root
Content type: text/plain
Branch: MAIN
CVS Tags: rel-1_04, HEAD
Changes since 1.1: +8 -10 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 /* Functions to make fuzzy comparisons between strings
2     Copyright (C) 1988, 1989, 1992, 1993, 1995 Free Software Foundation, Inc.
3    
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or (at
7     your option) any later version.
8    
9     This program is distributed in the hope that it will be useful, but
10     WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12     General Public License for more details.
13    
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17    
18    
19     Derived from GNU diff 2.7, analyze.c et al.
20    
21     The basic algorithm is described in:
22     "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
23     Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
24     see especially section 4.2, which describes the variation used below.
25    
26     The basic algorithm was independently discovered as described in:
27     "Algorithms for Approximate String Matching", E. Ukkonen,
28     Information and Control Vol. 64, 1985, pp. 100-118.
29    
30     Modified to work on strings rather than files
31     by Peter Miller <pmiller@agso.gov.au>, October 1995
32    
33     Modified to accept a "minimum similarity limit" to stop analyzing the
34     string when the similarity drops below the given limit by Marc Lehmann
35     <schmorp@schmorp.de>.
36    
37     Modified to work on unicode (actually 31 bit are allowed) by Marc Lehmann
38     <schmorp@schmorp.de>.
39     */
40    
41     #include <string.h>
42     #include <stdio.h>
43     #include <stdlib.h>
44     #include <limits.h>
45    
46     #include "fstrcmp.h"
47    
48     #define PARAMS(proto) proto
49    
50     /*
51     * Data on one input string being compared.
52     */
53     struct string_data
54     {
55     /* The string to be compared. */
56 root 1.2 const UV *data;
57 root 1.1
58     /* The length of the string to be compared. */
59     int data_length;
60    
61     /* The number of characters inserted or deleted. */
62     int edit_count;
63     };
64    
65     static struct string_data string[2];
66    
67     static int max_edits; /* compareseq stops when edits > max_edits */
68    
69     #ifdef MINUS_H_FLAG
70    
71     /* This corresponds to the diff -H flag. With this heuristic, for
72     strings with a constant small density of changes, the algorithm is
73     linear in the strings size. This is unlikely in typical uses of
74     fstrcmp, and so is usually compiled out. Besides, there is no
75     interface to set it true. */
76     static int heuristic;
77    
78     #endif
79    
80    
81     /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
82     point furthest along the given diagonal in the forward search of the
83     edit matrix. */
84     static int *fdiag;
85    
86     /* Vector, indexed by diagonal, containing the X coordinate of the point
87     furthest along the given diagonal in the backward search of the edit
88     matrix. */
89     static int *bdiag;
90    
91     /* Edit scripts longer than this are too expensive to compute. */
92     static int too_expensive;
93    
94     /* Snakes bigger than this are considered `big'. */
95     #define SNAKE_LIMIT 20
96    
97     struct partition
98     {
99     /* Midpoints of this partition. */
100     int xmid, ymid;
101    
102     /* Nonzero if low half will be analyzed minimally. */
103     int lo_minimal;
104    
105     /* Likewise for high half. */
106     int hi_minimal;
107     };
108    
109    
110     /* NAME
111     diag - find diagonal path
112    
113     SYNOPSIS
114     int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
115     struct partition *part);
116    
117     DESCRIPTION
118     Find the midpoint of the shortest edit script for a specified
119     portion of the two strings.
120    
121     Scan from the beginnings of the strings, and simultaneously from
122     the ends, doing a breadth-first search through the space of
123     edit-sequence. When the two searches meet, we have found the
124     midpoint of the shortest edit sequence.
125    
126     If MINIMAL is nonzero, find the minimal edit script regardless
127     of expense. Otherwise, if the search is too expensive, use
128     heuristics to stop the search and report a suboptimal answer.
129    
130     RETURNS
131     Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
132     number XMID - YMID equals the number of inserted characters
133     minus the number of deleted characters (counting only characters
134     before the midpoint). Return the approximate edit cost; this is
135     the total number of characters inserted or deleted (counting
136     only characters before the midpoint), unless a heuristic is used
137     to terminate the search prematurely.
138    
139     Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
140     for the left half of the partition is known; similarly for
141     PART->RIGHT_MINIMAL.
142    
143     CAVEAT
144     This function assumes that the first characters of the specified
145     portions of the two strings do not match, and likewise that the
146     last characters do not match. The caller must trim matching
147     characters from the beginning and end of the portions it is
148     going to specify.
149    
150     If we return the "wrong" partitions, the worst this can do is
151     cause suboptimal diff output. It cannot cause incorrect diff
152     output. */
153    
154     static int diag PARAMS ((int, int, int, int, int, struct partition *));
155    
156     static int
157     diag (xoff, xlim, yoff, ylim, minimal, part)
158     int xoff;
159     int xlim;
160     int yoff;
161     int ylim;
162     int minimal;
163     struct partition *part;
164     {
165     int *const fd = fdiag; /* Give the compiler a chance. */
166     int *const bd = bdiag; /* Additional help for the compiler. */
167 root 1.2 const UV *const xv = string[0].data; /* Still more help for the compiler. */
168     const UV *const yv = string[1].data; /* And more and more . . . */
169 root 1.1 const int dmin = xoff - ylim; /* Minimum valid diagonal. */
170     const int dmax = xlim - yoff; /* Maximum valid diagonal. */
171     const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
172     const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
173     int fmin = fmid;
174     int fmax = fmid; /* Limits of top-down search. */
175     int bmin = bmid;
176     int bmax = bmid; /* Limits of bottom-up search. */
177     int c; /* Cost. */
178     int odd = (fmid - bmid) & 1;
179    
180     /*
181     * True if southeast corner is on an odd diagonal with respect
182     * to the northwest.
183     */
184     fd[fmid] = xoff;
185     bd[bmid] = xlim;
186     for (c = 1;; ++c)
187     {
188     int d; /* Active diagonal. */
189     int big_snake;
190    
191     big_snake = 0;
192     /* Extend the top-down search by an edit step in each diagonal. */
193     if (fmin > dmin)
194     fd[--fmin - 1] = -1;
195     else
196     ++fmin;
197     if (fmax < dmax)
198     fd[++fmax + 1] = -1;
199     else
200     --fmax;
201     for (d = fmax; d >= fmin; d -= 2)
202     {
203     int x;
204     int y;
205     int oldx;
206     int tlo;
207     int thi;
208    
209     tlo = fd[d - 1],
210     thi = fd[d + 1];
211    
212     if (tlo >= thi)
213     x = tlo + 1;
214     else
215     x = thi;
216     oldx = x;
217     y = x - d;
218     while (x < xlim && y < ylim && xv[x] == yv[y])
219     {
220     ++x;
221     ++y;
222     }
223     if (x - oldx > SNAKE_LIMIT)
224     big_snake = 1;
225     fd[d] = x;
226     if (odd && bmin <= d && d <= bmax && bd[d] <= x)
227     {
228     part->xmid = x;
229     part->ymid = y;
230     part->lo_minimal = part->hi_minimal = 1;
231     return 2 * c - 1;
232     }
233     }
234     /* Similarly extend the bottom-up search. */
235     if (bmin > dmin)
236     bd[--bmin - 1] = INT_MAX;
237     else
238     ++bmin;
239     if (bmax < dmax)
240     bd[++bmax + 1] = INT_MAX;
241     else
242     --bmax;
243     for (d = bmax; d >= bmin; d -= 2)
244     {
245     int x;
246     int y;
247     int oldx;
248     int tlo;
249     int thi;
250    
251     tlo = bd[d - 1],
252     thi = bd[d + 1];
253     if (tlo < thi)
254     x = tlo;
255     else
256     x = thi - 1;
257     oldx = x;
258     y = x - d;
259     while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
260     {
261     --x;
262     --y;
263     }
264     if (oldx - x > SNAKE_LIMIT)
265     big_snake = 1;
266     bd[d] = x;
267     if (!odd && fmin <= d && d <= fmax && x <= fd[d])
268     {
269     part->xmid = x;
270     part->ymid = y;
271     part->lo_minimal = part->hi_minimal = 1;
272     return 2 * c;
273     }
274     }
275    
276     if (minimal)
277     continue;
278    
279     #ifdef MINUS_H_FLAG
280     /* Heuristic: check occasionally for a diagonal that has made lots
281     of progress compared with the edit distance. If we have any
282     such, find the one that has made the most progress and return
283     it as if it had succeeded.
284    
285     With this heuristic, for strings with a constant small density
286     of changes, the algorithm is linear in the strings size. */
287     if (c > 200 && big_snake && heuristic)
288     {
289     int best;
290    
291     best = 0;
292     for (d = fmax; d >= fmin; d -= 2)
293     {
294     int dd;
295     int x;
296     int y;
297     int v;
298    
299     dd = d - fmid;
300     x = fd[d];
301     y = x - d;
302     v = (x - xoff) * 2 - dd;
303    
304     if (v > 12 * (c + (dd < 0 ? -dd : dd)))
305     {
306     if
307     (
308     v > best
309     &&
310     xoff + SNAKE_LIMIT <= x
311     &&
312     x < xlim
313     &&
314     yoff + SNAKE_LIMIT <= y
315     &&
316     y < ylim
317     )
318     {
319     /* We have a good enough best diagonal; now insist
320     that it end with a significant snake. */
321     int k;
322    
323     for (k = 1; xv[x - k] == yv[y - k]; k++)
324     {
325     if (k == SNAKE_LIMIT)
326     {
327     best = v;
328     part->xmid = x;
329     part->ymid = y;
330     break;
331     }
332     }
333     }
334     }
335     }
336     if (best > 0)
337     {
338     part->lo_minimal = 1;
339     part->hi_minimal = 0;
340     return 2 * c - 1;
341     }
342     best = 0;
343     for (d = bmax; d >= bmin; d -= 2)
344     {
345     int dd;
346     int x;
347     int y;
348     int v;
349    
350     dd = d - bmid;
351     x = bd[d];
352     y = x - d;
353     v = (xlim - x) * 2 + dd;
354    
355     if (v > 12 * (c + (dd < 0 ? -dd : dd)))
356     {
357     if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
358     yoff < y && y <= ylim - SNAKE_LIMIT)
359     {
360     /* We have a good enough best diagonal; now insist
361     that it end with a significant snake. */
362     int k;
363    
364     for (k = 0; xv[x + k] == yv[y + k]; k++)
365     {
366     if (k == SNAKE_LIMIT - 1)
367     {
368     best = v;
369     part->xmid = x;
370     part->ymid = y;
371     break;
372     }
373     }
374     }
375     }
376     }
377     if (best > 0)
378     {
379     part->lo_minimal = 0;
380     part->hi_minimal = 1;
381     return 2 * c - 1;
382     }
383     }
384     #endif /* MINUS_H_FLAG */
385    
386     /* Heuristic: if we've gone well beyond the call of duty, give up
387     and report halfway between our best results so far. */
388     if (c >= too_expensive)
389     {
390     int fxybest;
391     int fxbest;
392     int bxybest;
393     int bxbest;
394    
395     /* Pacify `gcc -Wall'. */
396     fxbest = 0;
397     bxbest = 0;
398    
399     /* Find forward diagonal that maximizes X + Y. */
400     fxybest = -1;
401     for (d = fmax; d >= fmin; d -= 2)
402     {
403     int x;
404     int y;
405    
406     x = fd[d] < xlim ? fd[d] : xlim;
407     y = x - d;
408    
409     if (ylim < y)
410     {
411     x = ylim + d;
412     y = ylim;
413     }
414     if (fxybest < x + y)
415     {
416     fxybest = x + y;
417     fxbest = x;
418     }
419     }
420     /* Find backward diagonal that minimizes X + Y. */
421     bxybest = INT_MAX;
422     for (d = bmax; d >= bmin; d -= 2)
423     {
424     int x;
425     int y;
426    
427     x = xoff > bd[d] ? xoff : bd[d];
428     y = x - d;
429    
430     if (y < yoff)
431     {
432     x = yoff + d;
433     y = yoff;
434     }
435     if (x + y < bxybest)
436     {
437     bxybest = x + y;
438     bxbest = x;
439     }
440     }
441     /* Use the better of the two diagonals. */
442     if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
443     {
444     part->xmid = fxbest;
445     part->ymid = fxybest - fxbest;
446     part->lo_minimal = 1;
447     part->hi_minimal = 0;
448     }
449     else
450     {
451     part->xmid = bxbest;
452     part->ymid = bxybest - bxbest;
453     part->lo_minimal = 0;
454     part->hi_minimal = 1;
455     }
456     return 2 * c - 1;
457     }
458     }
459     }
460    
461    
462     /* NAME
463     compareseq - find edit sequence
464    
465     SYNOPSIS
466     void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
467    
468     DESCRIPTION
469     Compare in detail contiguous subsequences of the two strings
470     which are known, as a whole, to match each other.
471    
472     The subsequence of string 0 is [XOFF, XLIM) and likewise for
473     string 1.
474    
475     Note that XLIM, YLIM are exclusive bounds. All character
476     numbers are origin-0.
477    
478     If MINIMAL is nonzero, find a minimal difference no matter how
479     expensive it is. */
480    
481     static void compareseq PARAMS ((int, int, int, int, int));
482    
483     static void
484     compareseq (xoff, xlim, yoff, ylim, minimal)
485     int xoff;
486     int xlim;
487     int yoff;
488     int ylim;
489     int minimal;
490     {
491 root 1.2 const UV *const xv = string[0].data; /* Help the compiler. */
492     const UV *const yv = string[1].data;
493 root 1.1
494     if (string[1].edit_count + string[0].edit_count > max_edits)
495     return;
496    
497     /* Slide down the bottom initial diagonal. */
498     while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
499     {
500     ++xoff;
501     ++yoff;
502     }
503    
504     /* Slide up the top initial diagonal. */
505     while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
506     {
507     --xlim;
508     --ylim;
509     }
510    
511     /* Handle simple cases. */
512     if (xoff == xlim)
513     {
514     while (yoff < ylim)
515     {
516     ++string[1].edit_count;
517     ++yoff;
518     }
519     }
520     else if (yoff == ylim)
521     {
522     while (xoff < xlim)
523     {
524     ++string[0].edit_count;
525     ++xoff;
526     }
527     }
528     else
529     {
530     int c;
531     struct partition part;
532    
533     /* Find a point of correspondence in the middle of the strings. */
534     c = diag (xoff, xlim, yoff, ylim, minimal, &part);
535     if (c == 1)
536     {
537     #if 0
538     /* This should be impossible, because it implies that one of
539     the two subsequences is empty, and that case was handled
540     above without calling `diag'. Let's verify that this is
541     true. */
542     abort ();
543     #else
544     /* The two subsequences differ by a single insert or delete;
545     record it and we are done. */
546     if (part.xmid - part.ymid < xoff - yoff)
547     ++string[1].edit_count;
548     else
549     ++string[0].edit_count;
550     #endif
551     }
552     else
553     {
554     /* Use the partitions to split this problem into subproblems. */
555     compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
556     compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
557     }
558     }
559     }
560    
561    
562     /* NAME
563     fstrcmp - fuzzy string compare
564    
565     SYNOPSIS
566 root 1.2 double fstrcmp(const ChaR *s1, int l1, const UV *s2, int l2, double);
567 root 1.1
568     DESCRIPTION
569     The fstrcmp function may be used to compare two string for
570     similarity. It is very useful in reducing "cascade" or
571     "secondary" errors in compilers or other situations where
572     symbol tables occur.
573    
574     RETURNS
575     double; 0 if the strings are entirly dissimilar, 1 if the
576     strings are identical, and a number in between if they are
577     similar. */
578    
579     double
580 root 1.2 fstrcmp (const UV *string1, int length1,
581     const UV *string2, int length2,
582 root 1.1 double minimum)
583     {
584     int i;
585    
586     size_t fdiag_len;
587     static int *fdiag_buf;
588     static size_t fdiag_max;
589    
590     /* set the info for each string. */
591     string[0].data = string1;
592     string[0].data_length = length1;
593     string[1].data = string2;
594     string[1].data_length = length2;
595    
596     /* short-circuit obvious comparisons */
597     if (string[0].data_length == 0 && string[1].data_length == 0)
598     return 1.0;
599     if (string[0].data_length == 0 || string[1].data_length == 0)
600     return 0.0;
601    
602     /* Set TOO_EXPENSIVE to be approximate square root of input size,
603     bounded below by 256. */
604     too_expensive = 1;
605     for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
606     too_expensive <<= 1;
607     if (too_expensive < 256)
608     too_expensive = 256;
609    
610     /* Because fstrcmp is typically called multiple times, while scanning
611     symbol tables, etc, attempt to minimize the number of memory
612     allocations performed. Thus, we use a static buffer for the
613     diagonal vectors, and never free them. */
614     fdiag_len = string[0].data_length + string[1].data_length + 3;
615     if (fdiag_len > fdiag_max)
616     {
617     fdiag_max = fdiag_len;
618     fdiag_buf = realloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));
619     }
620     fdiag = fdiag_buf + string[1].data_length + 1;
621     bdiag = fdiag + fdiag_len;
622    
623     max_edits = 1 + (string[0].data_length + string[1].data_length) * (1. - minimum);
624    
625     /* Now do the main comparison algorithm */
626     string[0].edit_count = 0;
627     string[1].edit_count = 0;
628     compareseq (0, string[0].data_length, 0, string[1].data_length, 0);
629    
630     /* The result is
631     ((number of chars in common) / (average length of the strings)).
632     This is admittedly biased towards finding that the strings are
633     similar, however it does produce meaningful results. */
634     return ((double)
635     (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
636     / (string[0].data_length + string[1].data_length));
637    
638     }