libexplain  1.4.D001
libexplain/fstrcmp.c
Go to the documentation of this file.
00001 /*
00002  * cook - file construction tool
00003  * Copyright (C) 1991, 1993, 1994, 1997, 2001, 2005-2009, 2013 Peter Miller
00004  *
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU Lesser General Public License as published by
00007  * the Free Software Foundation; either version 3 of the License, or (at
00008  * your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013  * Lesser General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU Lesser General Public License
00016  * along with this program. If not, see <http://www.gnu.org/licenses/>.
00017  *
00018  *
00019  * This code is based on the heart of a file comparison program
00020  * written by David I. Bell, and used by kind permission.
00021  * This notice must be retained in all copies and derivatives.
00022  * Contact the author of libexplain for a copy of the file comparison program.
00023  *
00024  * This code is based on the algorithm in:
00025  *      An O(ND) Difference Algorithm and Its Variations
00026  *      Eugene W. Myers
00027  *      (TR 85-6, April 10, 1985)
00028  *      Department of Computer Science
00029  *      The University of Arizona
00030  *      Tuscon, Arizona 85721
00031  *
00032  * Also see:
00033  *      A File Comparison Program
00034  *      Webb Miller and Eugene W. Myers
00035  *      Software Practice and Experience
00036  *      (Volume 15, No. 11, November 1985)
00037  */
00038 
00039 #include <libexplain/ac/assert.h>
00040 #include <libexplain/ac/ctype.h>
00041 #include <libexplain/ac/stdlib.h>
00042 #include <libexplain/ac/string.h>
00043 
00044 #include <libexplain/fstrcmp.h>
00045 
00046 
00047 typedef struct snake_t snake_t;
00048 struct snake_t
00049 {
00050     long            line1;
00051     long            line2;
00052     long            count;
00053     snake_t         *next;
00054 };
00055 
00056 static long     tablesize;      /* needed table size */
00057 static long     tablesize_max;  /* allocated table size */
00058 static long     *V1;            /* the row containing the last d */
00059 static long     *V1_table;
00060 static long     *V2;            /* another row */
00061 static long     *V2_table;
00062 static snake_t  *nextsnake;     /* next allocable snake structure */
00063 static snake_t  *snake_table;   /* allocable snake structures */
00064 
00065 typedef struct file file;
00066 struct file
00067 {
00068     const char      *f_lines;
00069     long            f_linecount;
00070 };
00071 
00072 typedef struct fc_t fc_t;
00073 struct fc_t
00074 {
00075     file            fileA;
00076     file            fileB;
00077     long            maxlines;
00078     long            minlines;
00079     long            inserts;
00080     long            deletes;
00081     long            matches;
00082 };
00083 
00084 static fc_t     fc;
00085 
00086 
00087 /*
00088  * Routine to find the middle snake of an optimial D-path spanning
00089  * lines A to A+N in file A to lines B to B+N in file B.  Returns the
00090  * length D of the D-path as a return value, and the upper left and
00091  * lower right relative coordinates of a snake midway through the D-path.
00092  */
00093 
00094 static long
00095 midsnake(int depth, long A, long N, long B, long M, long *ulx, long *uly,
00096     long *lrx, long *lry)
00097 {
00098     long            x;
00099     long            y;
00100     long            k;
00101     long            oldx;
00102     const char      *lp1;
00103     const char      *lp2;
00104     long            DELTA;
00105     long            odd;
00106     long            MAXD;
00107     long            changes;
00108     long            D;
00109 
00110     (void)depth;
00111 
00112     DELTA = N - M;
00113     odd = DELTA & 1;
00114     MAXD = (M + N + 1) / 2;
00115     V1[1] = 0;
00116     V2[-1] = 0;
00117     changes = -odd - 2;
00118 
00119     /*
00120      * This is the main loop for searching for the snake.
00121      * D is the distance off the diagonals, and is the number
00122      * of changes needed to get from the upper left to the
00123      * lower right corner of the region.
00124      */
00125     for (D = 0; D <= MAXD; D++)
00126     {
00127         changes += 2;
00128 
00129         /*
00130          * Examine all diagonals within current distance.
00131          * First search from upper left to lower right,
00132          * and then search from lower right to upper left.
00133          */
00134         for (k = -D; k <= D; k += 2)
00135         {
00136             /*
00137              * Find the end of the furthest forward D-path
00138              * in diagonal k.
00139              */
00140             if (k == -D || (k != D && (V1[k - 1] < V1[k + 1])))
00141                 x = V1[k + 1];
00142             else
00143                 x = V1[k - 1] + 1;
00144             y = x - k;
00145             lp1 = &fc.fileA.f_lines[A + x];
00146             lp2 = &fc.fileB.f_lines[B + y];
00147             oldx = x;
00148             while (x < N && y < M && *lp1 == *lp2)
00149             {
00150                 x++;
00151                 y++;
00152                 lp1++;
00153                 lp2++;
00154             }
00155             V1[k] = x;
00156 
00157             /*
00158              * See if path overlaps furthest reverse D-path.
00159              * If so, then we have found the snake.
00160              */
00161             if (odd && k >= (DELTA - (D - 1)) && k <= (DELTA + (D - 1)))
00162             {
00163                 if ((x + V2[k - DELTA]) >= N)
00164                 {
00165                     *ulx = oldx;
00166                     *uly = oldx - k;
00167                     *lrx = x;
00168                     *lry = y;
00169                     return changes;
00170                 }
00171             }
00172         }
00173 
00174         for (k = -D; k <= D; k += 2)
00175         {
00176             /*
00177              * Find the end of the furthest reaching reverse
00178              * path in diagonal k+DELTA.
00179              */
00180             if (k == D || (k != -D && (V2[k + 1] < V2[k - 1])))
00181                 x = V2[k - 1];
00182             else
00183                 x = V2[k + 1] + 1;
00184             y = x + k;
00185             lp1 = &fc.fileA.f_lines[A + N - x - 1];
00186             lp2 = &fc.fileB.f_lines[B + M - y - 1];
00187             oldx = x;
00188             while (x < N && y < M && *lp1 == *lp2)
00189             {
00190                 x++;
00191                 y++;
00192                 lp1--;
00193                 lp2--;
00194             }
00195             V2[k] = x;
00196 
00197             /*
00198              * See if path overlaps furthest forward D-path.
00199              * If so, then we have found the snake.
00200              */
00201             if (!odd && (k <= D - DELTA) && (k >= -D - DELTA))
00202             {
00203                 if ((x + V1[k + DELTA]) >= N)
00204                 {
00205                     *ulx = N - x;
00206                     *uly = M - y;
00207                     *lrx = N - oldx;
00208                     *lry = *lrx + *uly - *ulx;
00209                     return changes;
00210                 }
00211             }
00212         }
00213     }
00214 
00215     /*
00216      * Middle snake procedure failed!
00217      */
00218     assert(0);
00219     return 0;
00220 }
00221 
00222 
00223 /*
00224  * Recursive routine to find a minimal D-path through the edit graph
00225  * of the two input files.  Arguments are the beginning line numbers in
00226  * the files, and the number of lines to examine.  This is basically a
00227  * divide-and-conquer routine which finds the middle snake of an optimal
00228  * D-path, then calls itself to find the remainder of the path before the
00229  * snake and after the snake.
00230  */
00231 
00232 static void
00233 findsnake(int depth, long A, long N, long B, long M)
00234 {
00235     snake_t         *sp;
00236     long            ulx = 0;
00237     long            uly = 0;
00238     long            lrx = 0;
00239     long            lry = 0;
00240     long            D;
00241     long            count;
00242 
00243     /*
00244      * If more than one change needed, then call ourself for each part.
00245      */
00246     D = midsnake(depth, A, N, B, M, &ulx, &uly, &lrx, &lry);
00247 
00248     if (D > 1)
00249     {
00250         if (ulx > 0 && uly > 0)
00251             findsnake(depth + 1, A, ulx, B, uly);
00252         count = lrx - ulx;
00253         sp = nextsnake++;
00254         sp->line1 = A + ulx;
00255         sp->line2 = B + uly;
00256         sp->count = count;
00257         N -= lrx;
00258         M -= lry;
00259         if (N > 0 && M > 0)
00260             findsnake(depth + 1, A + lrx, N, B + lry, M);
00261         return;
00262     }
00263 
00264     /*
00265      * Only 0 or 1 change needed, so we can compute the result directly.
00266      * First compute the snake coming from the upper left corner if any.
00267      */
00268     if (N > M)
00269         count = uly;
00270     else
00271         count = ulx;
00272     sp = nextsnake++;
00273     sp->line1 = A;
00274     sp->line2 = B;
00275     sp->count = count;
00276 
00277     /*
00278      * Finally compute the snake coming from the lower right corner if any.
00279      */
00280     count = lrx - ulx;
00281     sp = nextsnake++;
00282     sp->line1 = A + ulx;
00283     sp->line2 = B + uly;
00284     sp->count = count;
00285 }
00286 
00287 
00288 double
00289 explain_fstrcmp(const char *s1, const char *s2)
00290 {
00291     double          result;
00292     snake_t         *sp;        /* current snake element */
00293     long            line1;      /* current line in file A */
00294     long            line2;      /* current line in file B */
00295 
00296     fc.fileA.f_lines = s1;
00297     fc.fileA.f_linecount = strlen(s1);
00298     fc.fileB.f_lines = s2;
00299     fc.fileB.f_linecount = strlen(s2);
00300 
00301     /*
00302      * Check for trivial case of two empty strings.
00303      * This also avoids a division by zero at the end of is function.
00304      */
00305     if (!fc.fileA.f_linecount && !fc.fileB.f_linecount)
00306     {
00307         return 1;
00308     }
00309 
00310     if (fc.fileA.f_linecount < fc.fileB.f_linecount)
00311     {
00312         fc.minlines = fc.fileA.f_linecount;
00313         fc.maxlines = fc.fileB.f_linecount;
00314     }
00315     else
00316     {
00317         fc.minlines = fc.fileB.f_linecount;
00318         fc.maxlines = fc.fileA.f_linecount;
00319     }
00320 
00321     tablesize = fc.maxlines * 2 + 1;
00322     if (tablesize > tablesize_max)
00323     {
00324         tablesize_max = tablesize;
00325         if (V1_table)
00326             free(V1_table);
00327         V1_table = malloc(sizeof(*V1_table) * tablesize_max);
00328         if (!V1_table)
00329             return -1;
00330         if (V2_table)
00331             free(V2_table);
00332         V2_table = malloc(sizeof(*V2_table) * tablesize_max);
00333         if (!V2_table)
00334             return -1;
00335         if (snake_table)
00336             free(snake_table);
00337         snake_table = malloc(sizeof(*snake_table) * tablesize_max);
00338         if (!snake_table)
00339             return -1;
00340     }
00341 
00342     V1 = V1_table + fc.maxlines;
00343     V2 = V2_table + fc.maxlines;
00344     nextsnake = snake_table;
00345     if (fc.fileA.f_linecount > 0 && fc.fileB.f_linecount > 0)
00346     {
00347         findsnake(0, 0L, fc.fileA.f_linecount, 0L, fc.fileB.f_linecount);
00348     }
00349 
00350     /*
00351      * End the list with the lower right endpoint
00352      */
00353     sp = nextsnake++;
00354     sp->line1 = fc.fileA.f_linecount;
00355     sp->line2 = fc.fileB.f_linecount;
00356     sp->count = 0;
00357 
00358 #if 0
00359     /*
00360      * print out the snake list
00361      */
00362     for (sp = snake_table; sp < nextsnake; sp++)
00363     {
00364         printf("%d: line1 = %ld; line2 = %ld; count = %ld;\n",
00365             sp - snake_table, sp->line1, sp->line2, sp->count);
00366     }
00367 #endif
00368 
00369     /*
00370      * Scan the snake list and calculate the number of inserted,
00371      * deleted, and matching lines.
00372      */
00373     line1 = 0;
00374     line2 = 0;
00375     fc.deletes = 0;
00376     fc.inserts = 0;
00377     fc.matches = 0;
00378     for (sp = snake_table; sp < nextsnake; sp++)
00379     {
00380         fc.deletes += (sp->line1 - line1);
00381         fc.inserts += (sp->line2 - line2);
00382         fc.matches += sp->count;
00383         line1 = sp->line1 + sp->count;
00384         line2 = sp->line2 + sp->count;
00385     }
00386 
00387     /*
00388      * the result is 0 if the strings are entirely unalike,
00389      * and 1 if the strings are identical, and somewhere in between
00390      * if the are in any way similar.
00391      */
00392     result =
00393         (
00394             1
00395         -
00396             (
00397                 (double)(fc.inserts + fc.deletes)
00398             /
00399                 (fc.fileA.f_linecount + fc.fileB.f_linecount)
00400             )
00401         );
00402     return result;
00403 }
00404 
00405 
00406 static void
00407 downcase(char *out, const char *in)
00408 {
00409     for (;;)
00410     {
00411         unsigned char   c;
00412 
00413         c = *in++;
00414         if (isupper(c))
00415             c = tolower(c);
00416         *out++ = c;
00417         if (!c)
00418             return;
00419     }
00420 }
00421 
00422 
00423 double
00424 explain_fstrcasecmp(const char *s1, const char *s2)
00425 {
00426     size_t          len1;
00427     char            *lc1;
00428     size_t          len2;
00429     char            *lc2;
00430     double          result;
00431 
00432     len1 = strlen(s1) + 1;
00433     lc1 = malloc(len1);
00434     if (!lc1)
00435         return explain_fstrcmp(s1, s2);
00436     downcase(lc1, s1);
00437 
00438     len2 = strlen(s2) + 1;
00439     lc2 = malloc(len2);
00440     if (!lc2)
00441     {
00442         result = explain_fstrcmp(lc1, s2);
00443         free(lc1);
00444         return result;
00445     }
00446     downcase(lc2, s2);
00447 
00448     result = explain_fstrcmp(lc1, lc2);
00449     free(lc1);
00450     free(lc2);
00451     return result;
00452 }
00453 
00454 
00455 /* vim: set ts=8 sw=4 et : */