libexplain
1.4.D001
|
00001 /* 00002 * cook - file construction tool 00003 * Copyright (C) 1991, 1993, 1994, 1997, 2001, 2005-2009, 2013 Peter Miller 00004 * 00005 * This program is free software; you can redistribute it and/or modify 00006 * it under the terms of the GNU Lesser General Public License as published by 00007 * the Free Software Foundation; either version 3 of the License, or (at 00008 * your option) any later version. 00009 * 00010 * This program is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00013 * Lesser General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU Lesser General Public License 00016 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00017 * 00018 * 00019 * This code is based on the heart of a file comparison program 00020 * written by David I. Bell, and used by kind permission. 00021 * This notice must be retained in all copies and derivatives. 00022 * Contact the author of libexplain for a copy of the file comparison program. 00023 * 00024 * This code is based on the algorithm in: 00025 * An O(ND) Difference Algorithm and Its Variations 00026 * Eugene W. Myers 00027 * (TR 85-6, April 10, 1985) 00028 * Department of Computer Science 00029 * The University of Arizona 00030 * Tuscon, Arizona 85721 00031 * 00032 * Also see: 00033 * A File Comparison Program 00034 * Webb Miller and Eugene W. Myers 00035 * Software Practice and Experience 00036 * (Volume 15, No. 11, November 1985) 00037 */ 00038 00039 #include <libexplain/ac/assert.h> 00040 #include <libexplain/ac/ctype.h> 00041 #include <libexplain/ac/stdlib.h> 00042 #include <libexplain/ac/string.h> 00043 00044 #include <libexplain/fstrcmp.h> 00045 00046 00047 typedef struct snake_t snake_t; 00048 struct snake_t 00049 { 00050 long line1; 00051 long line2; 00052 long count; 00053 snake_t *next; 00054 }; 00055 00056 static long tablesize; /* needed table size */ 00057 static long tablesize_max; /* allocated table size */ 00058 static long *V1; /* the row containing the last d */ 00059 static long *V1_table; 00060 static long *V2; /* another row */ 00061 static long *V2_table; 00062 static snake_t *nextsnake; /* next allocable snake structure */ 00063 static snake_t *snake_table; /* allocable snake structures */ 00064 00065 typedef struct file file; 00066 struct file 00067 { 00068 const char *f_lines; 00069 long f_linecount; 00070 }; 00071 00072 typedef struct fc_t fc_t; 00073 struct fc_t 00074 { 00075 file fileA; 00076 file fileB; 00077 long maxlines; 00078 long minlines; 00079 long inserts; 00080 long deletes; 00081 long matches; 00082 }; 00083 00084 static fc_t fc; 00085 00086 00087 /* 00088 * Routine to find the middle snake of an optimial D-path spanning 00089 * lines A to A+N in file A to lines B to B+N in file B. Returns the 00090 * length D of the D-path as a return value, and the upper left and 00091 * lower right relative coordinates of a snake midway through the D-path. 00092 */ 00093 00094 static long 00095 midsnake(int depth, long A, long N, long B, long M, long *ulx, long *uly, 00096 long *lrx, long *lry) 00097 { 00098 long x; 00099 long y; 00100 long k; 00101 long oldx; 00102 const char *lp1; 00103 const char *lp2; 00104 long DELTA; 00105 long odd; 00106 long MAXD; 00107 long changes; 00108 long D; 00109 00110 (void)depth; 00111 00112 DELTA = N - M; 00113 odd = DELTA & 1; 00114 MAXD = (M + N + 1) / 2; 00115 V1[1] = 0; 00116 V2[-1] = 0; 00117 changes = -odd - 2; 00118 00119 /* 00120 * This is the main loop for searching for the snake. 00121 * D is the distance off the diagonals, and is the number 00122 * of changes needed to get from the upper left to the 00123 * lower right corner of the region. 00124 */ 00125 for (D = 0; D <= MAXD; D++) 00126 { 00127 changes += 2; 00128 00129 /* 00130 * Examine all diagonals within current distance. 00131 * First search from upper left to lower right, 00132 * and then search from lower right to upper left. 00133 */ 00134 for (k = -D; k <= D; k += 2) 00135 { 00136 /* 00137 * Find the end of the furthest forward D-path 00138 * in diagonal k. 00139 */ 00140 if (k == -D || (k != D && (V1[k - 1] < V1[k + 1]))) 00141 x = V1[k + 1]; 00142 else 00143 x = V1[k - 1] + 1; 00144 y = x - k; 00145 lp1 = &fc.fileA.f_lines[A + x]; 00146 lp2 = &fc.fileB.f_lines[B + y]; 00147 oldx = x; 00148 while (x < N && y < M && *lp1 == *lp2) 00149 { 00150 x++; 00151 y++; 00152 lp1++; 00153 lp2++; 00154 } 00155 V1[k] = x; 00156 00157 /* 00158 * See if path overlaps furthest reverse D-path. 00159 * If so, then we have found the snake. 00160 */ 00161 if (odd && k >= (DELTA - (D - 1)) && k <= (DELTA + (D - 1))) 00162 { 00163 if ((x + V2[k - DELTA]) >= N) 00164 { 00165 *ulx = oldx; 00166 *uly = oldx - k; 00167 *lrx = x; 00168 *lry = y; 00169 return changes; 00170 } 00171 } 00172 } 00173 00174 for (k = -D; k <= D; k += 2) 00175 { 00176 /* 00177 * Find the end of the furthest reaching reverse 00178 * path in diagonal k+DELTA. 00179 */ 00180 if (k == D || (k != -D && (V2[k + 1] < V2[k - 1]))) 00181 x = V2[k - 1]; 00182 else 00183 x = V2[k + 1] + 1; 00184 y = x + k; 00185 lp1 = &fc.fileA.f_lines[A + N - x - 1]; 00186 lp2 = &fc.fileB.f_lines[B + M - y - 1]; 00187 oldx = x; 00188 while (x < N && y < M && *lp1 == *lp2) 00189 { 00190 x++; 00191 y++; 00192 lp1--; 00193 lp2--; 00194 } 00195 V2[k] = x; 00196 00197 /* 00198 * See if path overlaps furthest forward D-path. 00199 * If so, then we have found the snake. 00200 */ 00201 if (!odd && (k <= D - DELTA) && (k >= -D - DELTA)) 00202 { 00203 if ((x + V1[k + DELTA]) >= N) 00204 { 00205 *ulx = N - x; 00206 *uly = M - y; 00207 *lrx = N - oldx; 00208 *lry = *lrx + *uly - *ulx; 00209 return changes; 00210 } 00211 } 00212 } 00213 } 00214 00215 /* 00216 * Middle snake procedure failed! 00217 */ 00218 assert(0); 00219 return 0; 00220 } 00221 00222 00223 /* 00224 * Recursive routine to find a minimal D-path through the edit graph 00225 * of the two input files. Arguments are the beginning line numbers in 00226 * the files, and the number of lines to examine. This is basically a 00227 * divide-and-conquer routine which finds the middle snake of an optimal 00228 * D-path, then calls itself to find the remainder of the path before the 00229 * snake and after the snake. 00230 */ 00231 00232 static void 00233 findsnake(int depth, long A, long N, long B, long M) 00234 { 00235 snake_t *sp; 00236 long ulx = 0; 00237 long uly = 0; 00238 long lrx = 0; 00239 long lry = 0; 00240 long D; 00241 long count; 00242 00243 /* 00244 * If more than one change needed, then call ourself for each part. 00245 */ 00246 D = midsnake(depth, A, N, B, M, &ulx, &uly, &lrx, &lry); 00247 00248 if (D > 1) 00249 { 00250 if (ulx > 0 && uly > 0) 00251 findsnake(depth + 1, A, ulx, B, uly); 00252 count = lrx - ulx; 00253 sp = nextsnake++; 00254 sp->line1 = A + ulx; 00255 sp->line2 = B + uly; 00256 sp->count = count; 00257 N -= lrx; 00258 M -= lry; 00259 if (N > 0 && M > 0) 00260 findsnake(depth + 1, A + lrx, N, B + lry, M); 00261 return; 00262 } 00263 00264 /* 00265 * Only 0 or 1 change needed, so we can compute the result directly. 00266 * First compute the snake coming from the upper left corner if any. 00267 */ 00268 if (N > M) 00269 count = uly; 00270 else 00271 count = ulx; 00272 sp = nextsnake++; 00273 sp->line1 = A; 00274 sp->line2 = B; 00275 sp->count = count; 00276 00277 /* 00278 * Finally compute the snake coming from the lower right corner if any. 00279 */ 00280 count = lrx - ulx; 00281 sp = nextsnake++; 00282 sp->line1 = A + ulx; 00283 sp->line2 = B + uly; 00284 sp->count = count; 00285 } 00286 00287 00288 double 00289 explain_fstrcmp(const char *s1, const char *s2) 00290 { 00291 double result; 00292 snake_t *sp; /* current snake element */ 00293 long line1; /* current line in file A */ 00294 long line2; /* current line in file B */ 00295 00296 fc.fileA.f_lines = s1; 00297 fc.fileA.f_linecount = strlen(s1); 00298 fc.fileB.f_lines = s2; 00299 fc.fileB.f_linecount = strlen(s2); 00300 00301 /* 00302 * Check for trivial case of two empty strings. 00303 * This also avoids a division by zero at the end of is function. 00304 */ 00305 if (!fc.fileA.f_linecount && !fc.fileB.f_linecount) 00306 { 00307 return 1; 00308 } 00309 00310 if (fc.fileA.f_linecount < fc.fileB.f_linecount) 00311 { 00312 fc.minlines = fc.fileA.f_linecount; 00313 fc.maxlines = fc.fileB.f_linecount; 00314 } 00315 else 00316 { 00317 fc.minlines = fc.fileB.f_linecount; 00318 fc.maxlines = fc.fileA.f_linecount; 00319 } 00320 00321 tablesize = fc.maxlines * 2 + 1; 00322 if (tablesize > tablesize_max) 00323 { 00324 tablesize_max = tablesize; 00325 if (V1_table) 00326 free(V1_table); 00327 V1_table = malloc(sizeof(*V1_table) * tablesize_max); 00328 if (!V1_table) 00329 return -1; 00330 if (V2_table) 00331 free(V2_table); 00332 V2_table = malloc(sizeof(*V2_table) * tablesize_max); 00333 if (!V2_table) 00334 return -1; 00335 if (snake_table) 00336 free(snake_table); 00337 snake_table = malloc(sizeof(*snake_table) * tablesize_max); 00338 if (!snake_table) 00339 return -1; 00340 } 00341 00342 V1 = V1_table + fc.maxlines; 00343 V2 = V2_table + fc.maxlines; 00344 nextsnake = snake_table; 00345 if (fc.fileA.f_linecount > 0 && fc.fileB.f_linecount > 0) 00346 { 00347 findsnake(0, 0L, fc.fileA.f_linecount, 0L, fc.fileB.f_linecount); 00348 } 00349 00350 /* 00351 * End the list with the lower right endpoint 00352 */ 00353 sp = nextsnake++; 00354 sp->line1 = fc.fileA.f_linecount; 00355 sp->line2 = fc.fileB.f_linecount; 00356 sp->count = 0; 00357 00358 #if 0 00359 /* 00360 * print out the snake list 00361 */ 00362 for (sp = snake_table; sp < nextsnake; sp++) 00363 { 00364 printf("%d: line1 = %ld; line2 = %ld; count = %ld;\n", 00365 sp - snake_table, sp->line1, sp->line2, sp->count); 00366 } 00367 #endif 00368 00369 /* 00370 * Scan the snake list and calculate the number of inserted, 00371 * deleted, and matching lines. 00372 */ 00373 line1 = 0; 00374 line2 = 0; 00375 fc.deletes = 0; 00376 fc.inserts = 0; 00377 fc.matches = 0; 00378 for (sp = snake_table; sp < nextsnake; sp++) 00379 { 00380 fc.deletes += (sp->line1 - line1); 00381 fc.inserts += (sp->line2 - line2); 00382 fc.matches += sp->count; 00383 line1 = sp->line1 + sp->count; 00384 line2 = sp->line2 + sp->count; 00385 } 00386 00387 /* 00388 * the result is 0 if the strings are entirely unalike, 00389 * and 1 if the strings are identical, and somewhere in between 00390 * if the are in any way similar. 00391 */ 00392 result = 00393 ( 00394 1 00395 - 00396 ( 00397 (double)(fc.inserts + fc.deletes) 00398 / 00399 (fc.fileA.f_linecount + fc.fileB.f_linecount) 00400 ) 00401 ); 00402 return result; 00403 } 00404 00405 00406 static void 00407 downcase(char *out, const char *in) 00408 { 00409 for (;;) 00410 { 00411 unsigned char c; 00412 00413 c = *in++; 00414 if (isupper(c)) 00415 c = tolower(c); 00416 *out++ = c; 00417 if (!c) 00418 return; 00419 } 00420 } 00421 00422 00423 double 00424 explain_fstrcasecmp(const char *s1, const char *s2) 00425 { 00426 size_t len1; 00427 char *lc1; 00428 size_t len2; 00429 char *lc2; 00430 double result; 00431 00432 len1 = strlen(s1) + 1; 00433 lc1 = malloc(len1); 00434 if (!lc1) 00435 return explain_fstrcmp(s1, s2); 00436 downcase(lc1, s1); 00437 00438 len2 = strlen(s2) + 1; 00439 lc2 = malloc(len2); 00440 if (!lc2) 00441 { 00442 result = explain_fstrcmp(lc1, s2); 00443 free(lc1); 00444 return result; 00445 } 00446 downcase(lc2, s2); 00447 00448 result = explain_fstrcmp(lc1, lc2); 00449 free(lc1); 00450 free(lc2); 00451 return result; 00452 } 00453 00454 00455 /* vim: set ts=8 sw=4 et : */