From f5948cfe673ec65c09b926249cc02ab4451243fa Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 4 Mar 2006 02:51:19 -0800 Subject: [PATCH 1/3] count-delta: no need for this anymore. This is a companion patch to e29e1147e485654d90a0ea0fd5fb7151bb194265 which made diffcore similarity estimator independent from the packfile deltifier. There is no reason for us to be counting the xdelta anymore. Signed-off-by: Junio C Hamano --- Makefile | 4 +-- count-delta.c | 72 --------------------------------------------------- count-delta.h | 10 ------- 3 files changed, 2 insertions(+), 84 deletions(-) delete mode 100644 count-delta.c delete mode 100644 count-delta.h diff --git a/Makefile b/Makefile index b6d8804d4b..ab2890d638 100644 --- a/Makefile +++ b/Makefile @@ -190,7 +190,7 @@ PYMODULES = \ LIB_FILE=libgit.a LIB_H = \ - blob.h cache.h commit.h count-delta.h csum-file.h delta.h \ + blob.h cache.h commit.h csum-file.h delta.h \ diff.h object.h pack.h pkt-line.h quote.h refs.h \ run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h @@ -200,7 +200,7 @@ DIFF_OBJS = \ diffcore-delta.o LIB_OBJS = \ - blob.o commit.o connect.o count-delta.o csum-file.o \ + blob.o commit.o connect.o csum-file.o \ date.o diff-delta.o entry.o exec_cmd.o ident.o index.o \ object.o pack-check.o patch-delta.o path.o pkt-line.o \ quote.o read-cache.o refs.o run-command.o \ diff --git a/count-delta.c b/count-delta.c deleted file mode 100644 index 058a2aadb1..0000000000 --- a/count-delta.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2005 Junio C Hamano - * The delta-parsing part is almost straight copy of patch-delta.c - * which is (C) 2005 Nicolas Pitre . - */ -#include -#include -#include -#include "delta.h" -#include "count-delta.h" - -/* - * NOTE. We do not _interpret_ delta fully. As an approximation, we - * just count the number of bytes that are copied from the source, and - * the number of literal data bytes that are inserted. - * - * Number of bytes that are _not_ copied from the source is deletion, - * and number of inserted literal bytes are addition, so sum of them - * is the extent of damage. - */ -int count_delta(void *delta_buf, unsigned long delta_size, - unsigned long *src_copied, unsigned long *literal_added) -{ - unsigned long copied_from_source, added_literal; - const unsigned char *data, *top; - unsigned char cmd; - unsigned long src_size, dst_size, out; - - if (delta_size < DELTA_SIZE_MIN) - return -1; - - data = delta_buf; - top = delta_buf + delta_size; - - src_size = get_delta_hdr_size(&data); - dst_size = get_delta_hdr_size(&data); - - added_literal = copied_from_source = out = 0; - while (data < top) { - cmd = *data++; - if (cmd & 0x80) { - unsigned long cp_off = 0, cp_size = 0; - if (cmd & 0x01) cp_off = *data++; - if (cmd & 0x02) cp_off |= (*data++ << 8); - if (cmd & 0x04) cp_off |= (*data++ << 16); - if (cmd & 0x08) cp_off |= (*data++ << 24); - if (cmd & 0x10) cp_size = *data++; - if (cmd & 0x20) cp_size |= (*data++ << 8); - if (cmd & 0x40) cp_size |= (*data++ << 16); - if (cp_size == 0) cp_size = 0x10000; - - copied_from_source += cp_size; - out += cp_size; - } else { - /* write literal into dst */ - added_literal += cmd; - out += cmd; - data += cmd; - } - } - - /* sanity check */ - if (data != top || out != dst_size) - return -1; - - /* delete size is what was _not_ copied from source. - * edit size is that and literal additions. - */ - *src_copied = copied_from_source; - *literal_added = added_literal; - return 0; -} diff --git a/count-delta.h b/count-delta.h deleted file mode 100644 index 7359629827..0000000000 --- a/count-delta.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Copyright (C) 2005 Junio C Hamano - */ -#ifndef COUNT_DELTA_H -#define COUNT_DELTA_H - -int count_delta(void *, unsigned long, - unsigned long *src_copied, unsigned long *literal_added); - -#endif From 4d0f39cecf93222401a81216e5cf2528e8abc6ae Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 4 Mar 2006 01:03:53 -0800 Subject: [PATCH 2/3] diffcore-break: similarity estimator fix. This is a companion patch to the previous fix to diffcore-rename. The merging-back process should use a logic similar to what is used there. Signed-off-by: Junio C Hamano --- diffcore-break.c | 47 +++++++++++++++++++++++++++-------------------- diffcore.h | 4 ++-- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/diffcore-break.c b/diffcore-break.c index 0fc2b860be..71ad58a25a 100644 --- a/diffcore-break.c +++ b/diffcore-break.c @@ -45,8 +45,8 @@ static int should_break(struct diff_filespec *src, * The value we return is 1 if we want the pair to be broken, * or 0 if we do not. */ - unsigned long delta_size, base_size, src_copied, literal_added; - int to_break = 0; + unsigned long delta_size, base_size, src_copied, literal_added, + src_removed; *merge_score_p = 0; /* assume no deletion --- "do not break" * is the default. @@ -72,33 +72,40 @@ static int should_break(struct diff_filespec *src, &src_copied, &literal_added)) return 0; + /* sanity */ + if (src->size < src_copied) + src_copied = src->size; + if (dst->size < literal_added + src_copied) { + if (src_copied < dst->size) + literal_added = dst->size - src_copied; + else + literal_added = 0; + } + src_removed = src->size - src_copied; + /* Compute merge-score, which is "how much is removed * from the source material". The clean-up stage will * merge the surviving pair together if the score is * less than the minimum, after rename/copy runs. */ - if (src->size <= src_copied) - ; /* all copied, nothing removed */ - else { - delta_size = src->size - src_copied; - *merge_score_p = delta_size * MAX_SCORE / src->size; - } - + *merge_score_p = src_removed * MAX_SCORE / src->size; + /* Extent of damage, which counts both inserts and * deletes. */ - if (src->size + literal_added <= src_copied) - delta_size = 0; /* avoid wrapping around */ - else - delta_size = (src->size - src_copied) + literal_added; - - /* We break if the edit exceeds the minimum. - * i.e. (break_score / MAX_SCORE < delta_size / base_size) - */ - if (break_score * base_size < delta_size * MAX_SCORE) - to_break = 1; + delta_size = src_removed + literal_added; + if (delta_size * MAX_SCORE / base_size < break_score) + return 0; - return to_break; + /* If you removed a lot without adding new material, that is + * not really a rewrite. + */ + if ((src->size * break_score < src_removed * MAX_SCORE) && + (literal_added * 20 < src_removed) && + (literal_added * 20 < src_copied)) + return 0; + + return 1; } void diffcore_break(int break_score) diff --git a/diffcore.h b/diffcore.h index dba4f17658..d31b3b476c 100644 --- a/diffcore.h +++ b/diffcore.h @@ -17,8 +17,8 @@ */ #define MAX_SCORE 60000.0 #define DEFAULT_RENAME_SCORE 30000 /* rename/copy similarity minimum (50%) */ -#define DEFAULT_BREAK_SCORE 30000 /* minimum for break to happen (50%)*/ -#define DEFAULT_MERGE_SCORE 48000 /* maximum for break-merge to happen (80%)*/ +#define DEFAULT_BREAK_SCORE 30000 /* minimum for break to happen (50%) */ +#define DEFAULT_MERGE_SCORE 36000 /* maximum for break-merge to happen 60%) */ #define MINIMUM_BREAK_SIZE 400 /* do not break a file smaller than this */ From ba23bbc8efff19f7ba4c272b6a06a4327a4cf039 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 4 Mar 2006 03:21:55 -0800 Subject: [PATCH 3/3] diffcore-delta: make change counter to byte oriented again. The textual line oriented change counter was fun but was not very effective. It tended to overcount the changes. This one changes it to a simple N-letter substring based implementation. Signed-off-by: Junio C Hamano --- diffcore-delta.c | 94 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 27 deletions(-) diff --git a/diffcore-delta.c b/diffcore-delta.c index 1e6a6911ec..70bacff837 100644 --- a/diffcore-delta.c +++ b/diffcore-delta.c @@ -1,32 +1,53 @@ #include "cache.h" #include "diff.h" #include "diffcore.h" -#include "delta.h" -#include "count-delta.h" -static int diffcore_count_changes_1(void *src, unsigned long src_size, - void *dst, unsigned long dst_size, - unsigned long delta_limit, - unsigned long *src_copied, - unsigned long *literal_added) +/* + * Idea here is very simple. + * + * We have total of (sz-N+1) N-byte overlapping sequences in buf whose + * size is sz. If the same N-byte sequence appears in both source and + * destination, we say the byte that starts that sequence is shared + * between them (i.e. copied from source to destination). + * + * For each possible N-byte sequence, if the source buffer has more + * instances of it than the destination buffer, that means the + * difference are the number of bytes not copied from source to + * destination. If the counts are the same, everything was copied + * from source to destination. If the destination has more, + * everything was copied, and destination added more. + * + * We are doing an approximation so we do not really have to waste + * memory by actually storing the sequence. We just hash them into + * somewhere around 2^16 hashbuckets and count the occurrences. + * + * The length of the sequence is arbitrarily set to 8 for now. + */ + +#define HASHBASE 65537 /* next_prime(2^16) */ + +static void hash_chars(unsigned char *buf, unsigned long sz, int *count) { - void *delta; - unsigned long delta_size; + unsigned int accum1, accum2, i; - delta = diff_delta(src, src_size, - dst, dst_size, - &delta_size, delta_limit); - if (!delta) - /* If delta_limit is exceeded, we have too much differences */ - return -1; - - /* Estimate the edit size by interpreting delta. */ - if (count_delta(delta, delta_size, src_copied, literal_added)) { - free(delta); - return -1; + /* an 8-byte shift register made of accum1 and accum2. New + * bytes come at LSB of accum2, and shifted up to accum1 + */ + for (i = accum1 = accum2 = 0; i < 7; i++, sz--) { + accum1 = (accum1 << 8) | (accum2 >> 24); + accum2 = (accum2 << 8) | *buf++; + } + while (sz) { + accum1 = (accum1 << 8) | (accum2 >> 24); + accum2 = (accum2 << 8) | *buf++; + /* We want something that hashes permuted byte + * sequences nicely; simpler hash like (accum1 ^ + * accum2) does not perform as well. + */ + i = (accum1 + accum2 * 0x61) % HASHBASE; + count[i]++; + sz--; } - free(delta); - return 0; } int diffcore_count_changes(void *src, unsigned long src_size, @@ -35,9 +56,28 @@ int diffcore_count_changes(void *src, unsigned long src_size, unsigned long *src_copied, unsigned long *literal_added) { - return diffcore_count_changes_1(src, src_size, - dst, dst_size, - delta_limit, - src_copied, - literal_added); + int *src_count, *dst_count, i; + unsigned long sc, la; + + if (src_size < 8 || dst_size < 8) + return -1; + + src_count = xcalloc(HASHBASE * 2, sizeof(int)); + dst_count = src_count + HASHBASE; + hash_chars(src, src_size, src_count); + hash_chars(dst, dst_size, dst_count); + + sc = la = 0; + for (i = 0; i < HASHBASE; i++) { + if (src_count[i] < dst_count[i]) { + la += dst_count[i] - src_count[i]; + sc += src_count[i]; + } + else /* i.e. if (dst_count[i] <= src_count[i]) */ + sc += dst_count[i]; + } + *src_copied = sc; + *literal_added = la; + free(src_count); + return 0; }