mirror of
https://github.com/git/git.git
synced 2024-11-17 22:44:49 +01:00
Geert's similarity
Define a function to compute a similarity score (0.0 <= score <= 1.0). Signed-off-by: Junio C Hamano <junkio@cox.net>
This commit is contained in:
parent
fd2bbdd238
commit
9a305b67f8
3 changed files with 39 additions and 24 deletions
29
gsimm.c
29
gsimm.c
|
@ -1,3 +1,4 @@
|
||||||
|
#include <string.h>
|
||||||
#include "rabinpoly.h"
|
#include "rabinpoly.h"
|
||||||
#include "gsimm.h"
|
#include "gsimm.h"
|
||||||
|
|
||||||
|
@ -32,6 +33,29 @@ static void freq_to_md(u_char *md, int *freq)
|
||||||
bzero (freq, sizeof(freq[0]) * MD_BITS);
|
bzero (freq, sizeof(freq[0]) * MD_BITS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Count the bit positions in which the first MD_LENGTH bytes of l and r
 * differ, i.e. the number of set bits in their byte-wise XOR.
 */
static int dist (u_char *l, u_char *r)
{
	int differing = 0;
	int idx;

	for (idx = 0; idx < MD_LENGTH; idx++) {
		u_char diff = l[idx] ^ r[idx];

		/* Shift the XOR down, tallying its low bit on every round. */
		while (diff) {
			differing += diff & 1;
			diff >>= 1;
		}
	}

	return differing;
}
|
||||||
|
|
||||||
|
/*
 * Turn the bit distance between l and r into a similarity score.
 *
 * The distance reported by dist() is divided by (MD_LENGTH * 4 - 1).
 * The result is 1.0 minus that ratio, or 0 once the ratio exceeds 1.0,
 * so identical inputs score 1.0 and the score never goes below 0.
 */
double gb_simm_score(u_char *l, u_char *r)
{
	double ratio = (double) dist(l, r) / (MD_LENGTH * 4 - 1);

	return (ratio > 1.0) ? 0.0 : 1.0 - ratio;
}
|
||||||
|
|
||||||
void gb_simm_process(u_char *data, unsigned len, u_char *md)
|
void gb_simm_process(u_char *data, unsigned len, u_char *md)
|
||||||
{ size_t j = 0;
|
{ size_t j = 0;
|
||||||
u_int32_t ofs;
|
u_int32_t ofs;
|
||||||
|
@ -39,6 +63,11 @@ void gb_simm_process(u_char *data, unsigned len, u_char *md)
|
||||||
u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
|
u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
|
||||||
int freq[MD_BITS];
|
int freq[MD_BITS];
|
||||||
|
|
||||||
|
if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) {
|
||||||
|
memset(md, 0, MD_LENGTH);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
bzero (freq, sizeof(freq[0]) * MD_BITS);
|
bzero (freq, sizeof(freq[0]) * MD_BITS);
|
||||||
bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
|
bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
|
||||||
bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
|
bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
|
||||||
|
|
5
gsimm.h
5
gsimm.h
|
@ -15,14 +15,15 @@
|
||||||
In order to get at least an average of 12 samples
|
In order to get at least an average of 12 samples
|
||||||
per bit in the final message digest, require at least 3 * MD_LENGTH
|
per bit in the final message digest, require at least 3 * MD_LENGTH
|
||||||
complete windows in the file. */
|
complete windows in the file. */
|
||||||
#define MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
|
#define GB_SIMM_MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1))
|
||||||
|
|
||||||
/* Limit matching algorithm to files less than 256 MB, so we can use
|
/* Limit matching algorithm to files less than 256 MB, so we can use
|
||||||
32 bit integers everywhere without fear of overflow. For larger
|
32 bit integers everywhere without fear of overflow. For larger
|
||||||
files we should add logic to mmap the file by piece and accumulate
|
files we should add logic to mmap the file by piece and accumulate
|
||||||
the frequency counts. */
|
the frequency counts. */
|
||||||
#define MAX_FILE_SIZE (256*1024*1024 - 1)
|
#define GB_SIMM_MAX_FILE_SIZE (256*1024*1024 - 1)
|
||||||
|
|
||||||
void gb_simm_process(u_char *data, unsigned len, u_char *md);
|
void gb_simm_process(u_char *data, unsigned len, u_char *md);
|
||||||
|
double gb_simm_score(u_char *l, u_char *r);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
23
test-gsimm.c
23
test-gsimm.c
|
@ -58,19 +58,6 @@ void usage()
|
||||||
exit (1);
|
exit (1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Return the number of set bits in the byte-wise XOR of the first
 * MD_LENGTH bytes of l and r.
 */
int dist (u_char *l, u_char *r)
{
	int total = 0;
	int pos = 0;

	while (pos < MD_LENGTH) {
		unsigned x = l[pos] ^ r[pos];
		unsigned mask;

		/* Probe each of the eight bit positions of the XORed byte. */
		for (mask = 1; mask <= 0x80; mask <<= 1) {
			if (x & mask) {
				total++;
			}
		}
		pos++;
	}

	return total;
}
|
|
||||||
|
|
||||||
char *md_to_str(u_char *md)
|
char *md_to_str(u_char *md)
|
||||||
{ int j;
|
{ int j;
|
||||||
|
|
||||||
|
@ -102,8 +89,8 @@ void process_file (char *name)
|
||||||
exit (2);
|
exit (2);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fs.st_size >= MIN_FILE_SIZE
|
if (fs.st_size >= GB_SIMM_MIN_FILE_SIZE
|
||||||
&& fs.st_size <= MAX_FILE_SIZE)
|
&& fs.st_size <= GB_SIMM_MAX_FILE_SIZE)
|
||||||
{ fi->length = fs.st_size;
|
{ fi->length = fs.st_size;
|
||||||
fi->name = name;
|
fi->name = name;
|
||||||
|
|
||||||
|
@ -116,13 +103,11 @@ void process_file (char *name)
|
||||||
|
|
||||||
gb_simm_process (data, fs.st_size, fi->md);
|
gb_simm_process (data, fs.st_size, fi->md);
|
||||||
if (flag_relative)
|
if (flag_relative)
|
||||||
{ int d = dist (fi->md, relative_md);
|
|
||||||
double sim = 1.0 - MIN (1.0, (double) (d) / (MD_LENGTH * 4 - 1));
|
|
||||||
fprintf (stdout, "%s %llu %u %s %u %3.1f\n",
|
fprintf (stdout, "%s %llu %u %s %u %3.1f\n",
|
||||||
md_to_str (fi->md), (long long unsigned) 0,
|
md_to_str (fi->md), (long long unsigned) 0,
|
||||||
(unsigned) fs.st_size, name,
|
(unsigned) fs.st_size, name,
|
||||||
d, 100.0 * sim);
|
(unsigned) 0,
|
||||||
}
|
100.0 * gb_simm_score(fi->md, relative_md));
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf (stdout, "%s %llu %u %s\n",
|
fprintf (stdout, "%s %llu %u %s\n",
|
||||||
|
|
Loading…
Reference in a new issue