2006-09-18 01:02:52 +02:00
|
|
|
#include "cache.h"
|
2017-06-14 20:07:36 +02:00
|
|
|
#include "config.h"
|
2006-09-18 01:02:52 +02:00
|
|
|
#include "grep.h"
|
2009-07-02 00:07:24 +02:00
|
|
|
#include "userdiff.h"
|
2007-06-05 04:36:11 +02:00
|
|
|
#include "xdiff-interface.h"
|
2013-05-10 17:10:15 +02:00
|
|
|
#include "diff.h"
|
|
|
|
#include "diffcore.h"
|
2016-06-25 07:22:30 +02:00
|
|
|
#include "commit.h"
|
2016-06-25 07:22:31 +02:00
|
|
|
#include "quote.h"
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2012-09-15 23:04:36 +02:00
|
|
|
static int grep_source_load(struct grep_source *gs);
|
|
|
|
static int grep_source_is_binary(struct grep_source *gs);
|
|
|
|
|
2012-10-10 01:17:50 +02:00
|
|
|
static struct grep_opt grep_defaults;
|
|
|
|
|
2017-03-17 19:41:54 +01:00
|
|
|
static void std_output(struct grep_opt *opt, const void *buf, size_t size)
|
|
|
|
{
|
|
|
|
fwrite(buf, size, 1, stdout);
|
|
|
|
}
|
|
|
|
|
2012-10-10 01:17:50 +02:00
|
|
|
/*
|
|
|
|
* Initialize the grep_defaults template with hardcoded defaults.
|
|
|
|
* We could let the compiler do this, but without C99 initializers
|
|
|
|
* the code gets unwieldy and unreadable, so...
|
|
|
|
*/
|
|
|
|
void init_grep_defaults(void)
|
|
|
|
{
|
|
|
|
struct grep_opt *opt = &grep_defaults;
|
2012-10-10 01:40:03 +02:00
|
|
|
static int run_once;
|
|
|
|
|
|
|
|
if (run_once)
|
|
|
|
return;
|
|
|
|
run_once++;
|
2012-10-10 01:17:50 +02:00
|
|
|
|
|
|
|
memset(opt, 0, sizeof(*opt));
|
|
|
|
opt->relative = 1;
|
|
|
|
opt->pathname = 1;
|
|
|
|
opt->max_depth = -1;
|
|
|
|
opt->pattern_type_option = GREP_PATTERN_TYPE_UNSPECIFIED;
|
2015-09-24 23:08:21 +02:00
|
|
|
color_set(opt->color_context, "");
|
|
|
|
color_set(opt->color_filename, "");
|
|
|
|
color_set(opt->color_function, "");
|
|
|
|
color_set(opt->color_lineno, "");
|
|
|
|
color_set(opt->color_match_context, GIT_COLOR_BOLD_RED);
|
|
|
|
color_set(opt->color_match_selected, GIT_COLOR_BOLD_RED);
|
|
|
|
color_set(opt->color_selected, "");
|
|
|
|
color_set(opt->color_sep, GIT_COLOR_CYAN);
|
2012-10-10 01:17:50 +02:00
|
|
|
opt->color = -1;
|
2017-03-17 19:41:54 +01:00
|
|
|
opt->output = std_output;
|
2012-10-10 01:17:50 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static int parse_pattern_type_arg(const char *opt, const char *arg)
|
|
|
|
{
|
|
|
|
if (!strcmp(arg, "default"))
|
|
|
|
return GREP_PATTERN_TYPE_UNSPECIFIED;
|
|
|
|
else if (!strcmp(arg, "basic"))
|
|
|
|
return GREP_PATTERN_TYPE_BRE;
|
|
|
|
else if (!strcmp(arg, "extended"))
|
|
|
|
return GREP_PATTERN_TYPE_ERE;
|
|
|
|
else if (!strcmp(arg, "fixed"))
|
|
|
|
return GREP_PATTERN_TYPE_FIXED;
|
|
|
|
else if (!strcmp(arg, "perl"))
|
|
|
|
return GREP_PATTERN_TYPE_PCRE;
|
|
|
|
die("bad %s argument: %s", opt, arg);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read the configuration file once and store it in
|
|
|
|
* the grep_defaults template.
|
|
|
|
*/
|
|
|
|
int grep_config(const char *var, const char *value, void *cb)
|
|
|
|
{
|
|
|
|
struct grep_opt *opt = &grep_defaults;
|
|
|
|
char *color = NULL;
|
|
|
|
|
|
|
|
if (userdiff_config(var, value) < 0)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
if (!strcmp(var, "grep.extendedregexp")) {
|
2017-06-30 00:22:18 +02:00
|
|
|
opt->extended_regexp_option = git_config_bool(var, value);
|
2012-10-10 01:17:50 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(var, "grep.patterntype")) {
|
|
|
|
opt->pattern_type_option = parse_pattern_type_arg(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!strcmp(var, "grep.linenumber")) {
|
|
|
|
opt->linenum = git_config_bool(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-03-17 20:16:05 +01:00
|
|
|
if (!strcmp(var, "grep.fullname")) {
|
|
|
|
opt->relative = !git_config_bool(var, value);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-10-10 01:17:50 +02:00
|
|
|
if (!strcmp(var, "color.grep"))
|
|
|
|
opt->color = git_config_colorbool(var, value);
|
|
|
|
else if (!strcmp(var, "color.grep.context"))
|
|
|
|
color = opt->color_context;
|
|
|
|
else if (!strcmp(var, "color.grep.filename"))
|
|
|
|
color = opt->color_filename;
|
|
|
|
else if (!strcmp(var, "color.grep.function"))
|
|
|
|
color = opt->color_function;
|
|
|
|
else if (!strcmp(var, "color.grep.linenumber"))
|
|
|
|
color = opt->color_lineno;
|
2014-10-27 19:23:05 +01:00
|
|
|
else if (!strcmp(var, "color.grep.matchcontext"))
|
|
|
|
color = opt->color_match_context;
|
|
|
|
else if (!strcmp(var, "color.grep.matchselected"))
|
|
|
|
color = opt->color_match_selected;
|
2012-10-10 01:17:50 +02:00
|
|
|
else if (!strcmp(var, "color.grep.selected"))
|
|
|
|
color = opt->color_selected;
|
|
|
|
else if (!strcmp(var, "color.grep.separator"))
|
|
|
|
color = opt->color_sep;
|
2014-10-27 19:23:05 +01:00
|
|
|
else if (!strcmp(var, "color.grep.match")) {
|
|
|
|
int rc = 0;
|
|
|
|
if (!value)
|
|
|
|
return config_error_nonbool(var);
|
2014-10-31 19:49:37 +01:00
|
|
|
rc |= color_parse(value, opt->color_match_context);
|
|
|
|
rc |= color_parse(value, opt->color_match_selected);
|
2014-10-27 19:23:05 +01:00
|
|
|
return rc;
|
|
|
|
}
|
2012-10-10 01:17:50 +02:00
|
|
|
|
|
|
|
if (color) {
|
|
|
|
if (!value)
|
|
|
|
return config_error_nonbool(var);
|
2014-10-07 21:33:09 +02:00
|
|
|
return color_parse(value, color);
|
2012-10-10 01:17:50 +02:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize one instance of grep_opt and copy the
|
|
|
|
* default values from the template we read the configuration
|
|
|
|
* information in an earlier call to git_config(grep_config).
|
|
|
|
*/
|
|
|
|
void grep_init(struct grep_opt *opt, const char *prefix)
|
|
|
|
{
|
|
|
|
struct grep_opt *def = &grep_defaults;
|
|
|
|
|
|
|
|
memset(opt, 0, sizeof(*opt));
|
|
|
|
opt->prefix = prefix;
|
|
|
|
opt->prefix_length = (prefix && *prefix) ? strlen(prefix) : 0;
|
|
|
|
opt->pattern_tail = &opt->pattern_list;
|
|
|
|
opt->header_tail = &opt->header_list;
|
|
|
|
|
|
|
|
opt->color = def->color;
|
|
|
|
opt->extended_regexp_option = def->extended_regexp_option;
|
|
|
|
opt->pattern_type_option = def->pattern_type_option;
|
|
|
|
opt->linenum = def->linenum;
|
|
|
|
opt->max_depth = def->max_depth;
|
|
|
|
opt->pathname = def->pathname;
|
|
|
|
opt->relative = def->relative;
|
2017-03-17 19:41:54 +01:00
|
|
|
opt->output = def->output;
|
2012-10-10 01:17:50 +02:00
|
|
|
|
2015-09-24 23:08:21 +02:00
|
|
|
color_set(opt->color_context, def->color_context);
|
|
|
|
color_set(opt->color_filename, def->color_filename);
|
|
|
|
color_set(opt->color_function, def->color_function);
|
|
|
|
color_set(opt->color_lineno, def->color_lineno);
|
|
|
|
color_set(opt->color_match_context, def->color_match_context);
|
|
|
|
color_set(opt->color_match_selected, def->color_match_selected);
|
|
|
|
color_set(opt->color_selected, def->color_selected);
|
|
|
|
color_set(opt->color_sep, def->color_sep);
|
2012-10-10 01:17:50 +02:00
|
|
|
}
|
2012-09-15 23:04:36 +02:00
|
|
|
|
grep: further simplify setting the pattern type
When c5c31d33 (grep: move pattern-type bits support to top-level
grep.[ch], 2012-10-03) introduced grep_commit_pattern_type() helper
function, the intention was to allow the users of grep API to having
to fiddle only with .pattern_type_option (which can be set to "fixed",
"basic", "extended", and "pcre"), and then immediately before compiling
the pattern strings for use, call grep_commit_pattern_type() to have
it prepare various bits in the grep_opt structure (like .fixed,
.regflags, etc.).
However, grep_set_pattern_type_option() helper function the grep API
internally uses were left as an external function by mistake. This
function shouldn't have been made callable by the users of the API.
Later when the grep API was used in revision traversal machinery,
the caller then mistakenly started calling the function around
34a4ae55 (log --grep: use the same helper to set -E/-F options as
"git grep", 2012-10-03), instead of setting the .pattern_type_option
field and letting the grep_commit_pattern_type() to take care of the
details.
This caused an unnecessary bug that made a configured
grep.patternType take precedence over the command line options
(e.g. --basic-regexp, --fixed-strings) in "git log" family of
commands.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-22 20:43:14 +02:00
|
|
|
static void grep_set_pattern_type_option(enum grep_pattern_type pattern_type, struct grep_opt *opt)
|
2012-10-03 23:47:48 +02:00
|
|
|
{
|
2017-06-30 00:22:21 +02:00
|
|
|
/*
|
|
|
|
* When committing to the pattern type by setting the relevant
|
|
|
|
* fields in grep_opt it's generally not necessary to zero out
|
|
|
|
* the fields we're not choosing, since they won't have been
|
|
|
|
* set by anything. The extended_regexp_option field is the
|
|
|
|
* only exception to this.
|
|
|
|
*
|
|
|
|
* This is because in the process of parsing grep.patternType
|
|
|
|
* & grep.extendedRegexp we set opt->pattern_type_option and
|
|
|
|
* opt->extended_regexp_option, respectively. We then
|
|
|
|
* internally use opt->extended_regexp_option to see if we're
|
|
|
|
* compiling an ERE. It must be unset if that's not actually
|
|
|
|
* the case.
|
|
|
|
*/
|
|
|
|
if (pattern_type != GREP_PATTERN_TYPE_ERE &&
|
|
|
|
opt->extended_regexp_option)
|
|
|
|
opt->extended_regexp_option = 0;
|
|
|
|
|
2012-10-03 23:47:48 +02:00
|
|
|
switch (pattern_type) {
|
|
|
|
case GREP_PATTERN_TYPE_UNSPECIFIED:
|
|
|
|
/* fall through */
|
|
|
|
|
|
|
|
case GREP_PATTERN_TYPE_BRE:
|
|
|
|
break;
|
|
|
|
|
|
|
|
case GREP_PATTERN_TYPE_ERE:
|
2017-06-30 00:22:21 +02:00
|
|
|
opt->extended_regexp_option = 1;
|
2012-10-03 23:47:48 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
case GREP_PATTERN_TYPE_FIXED:
|
|
|
|
opt->fixed = 1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case GREP_PATTERN_TYPE_PCRE:
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
#ifdef USE_LIBPCRE2
|
|
|
|
opt->pcre2 = 1;
|
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* It's important that pcre1 always be assigned to
|
|
|
|
* even when there's no USE_LIBPCRE* defined. We still
|
|
|
|
* call the PCRE stub function, it just dies with
|
|
|
|
* "cannot use Perl-compatible regexes[...]".
|
|
|
|
*/
|
2017-05-25 21:45:29 +02:00
|
|
|
opt->pcre1 = 1;
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
#endif
|
2012-10-03 23:47:48 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
grep: further simplify setting the pattern type
When c5c31d33 (grep: move pattern-type bits support to top-level
grep.[ch], 2012-10-03) introduced grep_commit_pattern_type() helper
function, the intention was to allow the users of grep API to having
to fiddle only with .pattern_type_option (which can be set to "fixed",
"basic", "extended", and "pcre"), and then immediately before compiling
the pattern strings for use, call grep_commit_pattern_type() to have
it prepare various bits in the grep_opt structure (like .fixed,
.regflags, etc.).
However, grep_set_pattern_type_option() helper function the grep API
internally uses were left as an external function by mistake. This
function shouldn't have been made callable by the users of the API.
Later when the grep API was used in revision traversal machinery,
the caller then mistakenly started calling the function around
34a4ae55 (log --grep: use the same helper to set -E/-F options as
"git grep", 2012-10-03), instead of setting the .pattern_type_option
field and letting the grep_commit_pattern_type() to take care of the
details.
This caused an unnecessary bug that made a configured
grep.patternType take precedence over the command line options
(e.g. --basic-regexp, --fixed-strings) in "git log" family of
commands.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-22 20:43:14 +02:00
|
|
|
void grep_commit_pattern_type(enum grep_pattern_type pattern_type, struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
if (pattern_type != GREP_PATTERN_TYPE_UNSPECIFIED)
|
|
|
|
grep_set_pattern_type_option(pattern_type, opt);
|
|
|
|
else if (opt->pattern_type_option != GREP_PATTERN_TYPE_UNSPECIFIED)
|
|
|
|
grep_set_pattern_type_option(opt->pattern_type_option, opt);
|
|
|
|
else if (opt->extended_regexp_option)
|
2017-06-30 00:22:21 +02:00
|
|
|
/*
|
|
|
|
* This branch *must* happen after setting from the
|
|
|
|
* opt->pattern_type_option above, we don't want
|
|
|
|
* grep.extendedRegexp to override grep.patternType!
|
|
|
|
*/
|
grep: further simplify setting the pattern type
When c5c31d33 (grep: move pattern-type bits support to top-level
grep.[ch], 2012-10-03) introduced grep_commit_pattern_type() helper
function, the intention was to allow the users of grep API to having
to fiddle only with .pattern_type_option (which can be set to "fixed",
"basic", "extended", and "pcre"), and then immediately before compiling
the pattern strings for use, call grep_commit_pattern_type() to have
it prepare various bits in the grep_opt structure (like .fixed,
.regflags, etc.).
However, grep_set_pattern_type_option() helper function the grep API
internally uses were left as an external function by mistake. This
function shouldn't have been made callable by the users of the API.
Later when the grep API was used in revision traversal machinery,
the caller then mistakenly started calling the function around
34a4ae55 (log --grep: use the same helper to set -E/-F options as
"git grep", 2012-10-03), instead of setting the .pattern_type_option
field and letting the grep_commit_pattern_type() to take care of the
details.
This caused an unnecessary bug that made a configured
grep.patternType take precedence over the command line options
(e.g. --basic-regexp, --fixed-strings) in "git log" family of
commands.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-22 20:43:14 +02:00
|
|
|
grep_set_pattern_type_option(GREP_PATTERN_TYPE_ERE, opt);
|
|
|
|
}
|
|
|
|
|
2012-05-20 16:32:39 +02:00
|
|
|
static struct grep_pat *create_grep_pat(const char *pat, size_t patlen,
|
|
|
|
const char *origin, int no,
|
|
|
|
enum grep_pat_token t,
|
|
|
|
enum grep_header_field field)
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
{
|
|
|
|
struct grep_pat *p = xcalloc(1, sizeof(*p));
|
2012-05-20 16:33:07 +02:00
|
|
|
p->pattern = xmemdupz(pat, patlen);
|
2012-05-20 16:32:39 +02:00
|
|
|
p->patternlen = patlen;
|
|
|
|
p->origin = origin;
|
|
|
|
p->no = no;
|
|
|
|
p->token = t;
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
p->field = field;
|
2012-05-20 16:32:39 +02:00
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
2012-05-20 16:32:54 +02:00
|
|
|
static void do_append_grep_pat(struct grep_pat ***tail, struct grep_pat *p)
|
|
|
|
{
|
|
|
|
**tail = p;
|
|
|
|
*tail = &p->next;
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
p->next = NULL;
|
2012-05-20 16:33:07 +02:00
|
|
|
|
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
|
|
|
for (;;) {
|
|
|
|
struct grep_pat *new_pat;
|
|
|
|
size_t len = 0;
|
|
|
|
char *cp = p->pattern + p->patternlen, *nl = NULL;
|
|
|
|
while (++len <= p->patternlen) {
|
|
|
|
if (*(--cp) == '\n') {
|
|
|
|
nl = cp;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!nl)
|
|
|
|
break;
|
|
|
|
new_pat = create_grep_pat(nl + 1, len - 1, p->origin,
|
|
|
|
p->no, p->token, p->field);
|
|
|
|
new_pat->next = p->next;
|
|
|
|
if (!p->next)
|
|
|
|
*tail = &new_pat->next;
|
|
|
|
p->next = new_pat;
|
|
|
|
*nl = '\0';
|
|
|
|
p->patternlen -= len;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
2012-05-20 16:32:54 +02:00
|
|
|
}
|
|
|
|
|
2012-05-20 16:32:39 +02:00
|
|
|
void append_header_grep_pattern(struct grep_opt *opt,
|
|
|
|
enum grep_header_field field, const char *pat)
|
|
|
|
{
|
|
|
|
struct grep_pat *p = create_grep_pat(pat, strlen(pat), "header", 0,
|
|
|
|
GREP_PATTERN_HEAD, field);
|
2012-09-29 20:59:52 +02:00
|
|
|
if (field == GREP_HEADER_REFLOG)
|
|
|
|
opt->use_reflog_filter = 1;
|
2012-05-20 16:32:54 +02:00
|
|
|
do_append_grep_pat(&opt->header_tail, p);
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
}
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
void append_grep_pattern(struct grep_opt *opt, const char *pat,
|
|
|
|
const char *origin, int no, enum grep_pat_token t)
|
2010-05-22 23:43:43 +02:00
|
|
|
{
|
|
|
|
append_grep_pat(opt, pat, strlen(pat), origin, no, t);
|
|
|
|
}
|
|
|
|
|
|
|
|
void append_grep_pat(struct grep_opt *opt, const char *pat, size_t patlen,
|
|
|
|
const char *origin, int no, enum grep_pat_token t)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
2012-05-20 16:32:39 +02:00
|
|
|
struct grep_pat *p = create_grep_pat(pat, patlen, origin, no, t, 0);
|
2012-05-20 16:32:54 +02:00
|
|
|
do_append_grep_pat(&opt->pattern_tail, p);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2010-01-25 23:51:39 +01:00
|
|
|
struct grep_opt *grep_opt_dup(const struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
struct grep_pat *pat;
|
|
|
|
struct grep_opt *ret = xmalloc(sizeof(struct grep_opt));
|
|
|
|
*ret = *opt;
|
|
|
|
|
|
|
|
ret->pattern_list = NULL;
|
|
|
|
ret->pattern_tail = &ret->pattern_list;
|
|
|
|
|
|
|
|
for(pat = opt->pattern_list; pat != NULL; pat = pat->next)
|
|
|
|
{
|
|
|
|
if(pat->token == GREP_PATTERN_HEAD)
|
|
|
|
append_header_grep_pattern(ret, pat->field,
|
|
|
|
pat->pattern);
|
|
|
|
else
|
2010-05-22 23:43:43 +02:00
|
|
|
append_grep_pat(ret, pat->pattern, pat->patternlen,
|
|
|
|
pat->origin, pat->no, pat->token);
|
2010-01-25 23:51:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-05-09 23:52:04 +02:00
|
|
|
static NORETURN void compile_regexp_failed(const struct grep_pat *p,
|
|
|
|
const char *error)
|
|
|
|
{
|
|
|
|
char where[1024];
|
|
|
|
|
|
|
|
if (p->no)
|
2015-09-24 23:06:51 +02:00
|
|
|
xsnprintf(where, sizeof(where), "In '%s' at %d, ", p->origin, p->no);
|
2011-05-09 23:52:04 +02:00
|
|
|
else if (p->origin)
|
2015-09-24 23:06:51 +02:00
|
|
|
xsnprintf(where, sizeof(where), "%s, ", p->origin);
|
2011-05-09 23:52:04 +02:00
|
|
|
else
|
|
|
|
where[0] = 0;
|
|
|
|
|
|
|
|
die("%s'%s': %s", where, p->pattern, error);
|
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:30 +02:00
|
|
|
static int is_fixed(const char *s, size_t len)
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
|
if (is_regex_special(s[i]))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:27 +02:00
|
|
|
static int has_null(const char *s, size_t len)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* regcomp cannot accept patterns with NULs so when using it
|
|
|
|
* we consider any pattern containing a NUL fixed.
|
|
|
|
*/
|
|
|
|
if (memchr(s, 0, len))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:28 +02:00
|
|
|
#ifdef USE_LIBPCRE1
|
2017-05-25 21:45:29 +02:00
|
|
|
static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt)
|
2011-05-09 23:52:05 +02:00
|
|
|
{
|
|
|
|
const char *error;
|
|
|
|
int erroffset;
|
2012-02-25 10:24:28 +01:00
|
|
|
int options = PCRE_MULTILINE;
|
2011-05-09 23:52:05 +02:00
|
|
|
|
2016-06-25 07:22:33 +02:00
|
|
|
if (opt->ignore_case) {
|
|
|
|
if (has_non_ascii(p->pattern))
|
2017-05-25 21:45:29 +02:00
|
|
|
p->pcre1_tables = pcre_maketables();
|
2011-05-09 23:52:05 +02:00
|
|
|
options |= PCRE_CASELESS;
|
2016-06-25 07:22:33 +02:00
|
|
|
}
|
2016-06-25 07:22:35 +02:00
|
|
|
if (is_utf8_locale() && has_non_ascii(p->pattern))
|
|
|
|
options |= PCRE_UTF8;
|
2011-05-09 23:52:05 +02:00
|
|
|
|
2017-05-25 21:45:29 +02:00
|
|
|
p->pcre1_regexp = pcre_compile(p->pattern, options, &error, &erroffset,
|
|
|
|
p->pcre1_tables);
|
|
|
|
if (!p->pcre1_regexp)
|
2011-05-09 23:52:05 +02:00
|
|
|
compile_regexp_failed(p, error);
|
|
|
|
|
grep: add support for the PCRE v1 JIT API
Change the grep PCRE v1 code to use JIT when available. When PCRE
support was initially added in commit 63e7e9d8b6 ("git-grep: Learn
PCRE", 2011-05-09) PCRE had no JIT support, it was integrated into
8.20 released on 2011-10-21.
Enabling JIT support usually improves performance by more than
40%. The pattern compilation times are relatively slower, but those
relative numbers are tiny, and are easily made back in all but the
most trivial cases of grep. Detailed benchmarks & overview of
compilation times is at: http://sljit.sourceforge.net/pcre.html
With this change the difference in a t/perf/p7820-grep-engines.sh run
is, with just the /perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~ HEAD p7820-grep-engines.sh
Test HEAD~ HEAD
---------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.35(1.11+0.43) 0.23(0.42+0.46) -34.3%
7820.7: perl grep '^how to' 0.64(2.71+0.36) 0.27(0.66+0.44) -57.8%
7820.11: perl grep '[how] to' 0.63(2.51+0.42) 0.33(0.98+0.39) -47.6%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.17(5.61+0.35) 0.34(1.08+0.46) -70.9%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.43(1.52+0.44) 0.30(0.88+0.42) -30.2%
The conditional support for JIT is implemented as suggested in the
pcrejit(3) man page. E.g. defining PCRE_STUDY_JIT_COMPILE to 0 if it's
not present.
The implementation is relatively verbose because even if
PCRE_CONFIG_JIT is defined only a call to pcre_config() can determine
if the JIT is available, and if so the faster pcre_jit_exec() function
should be called instead of pcre_exec(), and a different (but not
complimentary!) function needs to be called to free pcre1_extra_info.
There's no graceful fallback if pcre_jit_stack_alloc() fails under
PCRE_CONFIG_JIT, instead the program will simply abort. I don't think
this is worth handling gracefully, it'll only fail in cases where
malloc() doesn't work, in which case we're screwed anyway.
That there's no assignment of `p->pcre1_jit_on = 0` when
PCRE_CONFIG_JIT isn't defined isn't a bug. The create_grep_pat()
function allocates the grep_pat allocates it with calloc(), so it's
guaranteed to be 0 when PCRE_CONFIG_JIT isn't defined.
I you're bisecting and find this change, check that your PCRE isn't
older than 8.32. This change intentionally broke really old versions
of PCRE, but that's fixed in follow-up commits.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-25 22:05:25 +02:00
|
|
|
p->pcre1_extra_info = pcre_study(p->pcre1_regexp, PCRE_STUDY_JIT_COMPILE, &error);
|
2017-05-25 21:45:29 +02:00
|
|
|
if (!p->pcre1_extra_info && error)
|
2011-05-09 23:52:05 +02:00
|
|
|
die("%s", error);
|
grep: add support for the PCRE v1 JIT API
Change the grep PCRE v1 code to use JIT when available. When PCRE
support was initially added in commit 63e7e9d8b6 ("git-grep: Learn
PCRE", 2011-05-09) PCRE had no JIT support, it was integrated into
8.20 released on 2011-10-21.
Enabling JIT support usually improves performance by more than
40%. The pattern compilation times are relatively slower, but those
relative numbers are tiny, and are easily made back in all but the
most trivial cases of grep. Detailed benchmarks & overview of
compilation times is at: http://sljit.sourceforge.net/pcre.html
With this change the difference in a t/perf/p7820-grep-engines.sh run
is, with just the /perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~ HEAD p7820-grep-engines.sh
Test HEAD~ HEAD
---------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.35(1.11+0.43) 0.23(0.42+0.46) -34.3%
7820.7: perl grep '^how to' 0.64(2.71+0.36) 0.27(0.66+0.44) -57.8%
7820.11: perl grep '[how] to' 0.63(2.51+0.42) 0.33(0.98+0.39) -47.6%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.17(5.61+0.35) 0.34(1.08+0.46) -70.9%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.43(1.52+0.44) 0.30(0.88+0.42) -30.2%
The conditional support for JIT is implemented as suggested in the
pcrejit(3) man page. E.g. defining PCRE_STUDY_JIT_COMPILE to 0 if it's
not present.
The implementation is relatively verbose because even if
PCRE_CONFIG_JIT is defined only a call to pcre_config() can determine
if the JIT is available, and if so the faster pcre_jit_exec() function
should be called instead of pcre_exec(), and a different (but not
complimentary!) function needs to be called to free pcre1_extra_info.
There's no graceful fallback if pcre_jit_stack_alloc() fails under
PCRE_CONFIG_JIT, instead the program will simply abort. I don't think
this is worth handling gracefully, it'll only fail in cases where
malloc() doesn't work, in which case we're screwed anyway.
That there's no assignment of `p->pcre1_jit_on = 0` when
PCRE_CONFIG_JIT isn't defined isn't a bug. The create_grep_pat()
function allocates the grep_pat allocates it with calloc(), so it's
guaranteed to be 0 when PCRE_CONFIG_JIT isn't defined.
I you're bisecting and find this change, check that your PCRE isn't
older than 8.32. This change intentionally broke really old versions
of PCRE, but that's fixed in follow-up commits.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-25 22:05:25 +02:00
|
|
|
|
2017-05-25 22:05:26 +02:00
|
|
|
#ifdef GIT_PCRE1_USE_JIT
|
grep: add support for the PCRE v1 JIT API
Change the grep PCRE v1 code to use JIT when available. When PCRE
support was initially added in commit 63e7e9d8b6 ("git-grep: Learn
PCRE", 2011-05-09) PCRE had no JIT support, it was integrated into
8.20 released on 2011-10-21.
Enabling JIT support usually improves performance by more than
40%. The pattern compilation times are relatively slower, but those
relative numbers are tiny, and are easily made back in all but the
most trivial cases of grep. Detailed benchmarks & overview of
compilation times is at: http://sljit.sourceforge.net/pcre.html
With this change the difference in a t/perf/p7820-grep-engines.sh run
is, with just the /perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~ HEAD p7820-grep-engines.sh
Test HEAD~ HEAD
---------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.35(1.11+0.43) 0.23(0.42+0.46) -34.3%
7820.7: perl grep '^how to' 0.64(2.71+0.36) 0.27(0.66+0.44) -57.8%
7820.11: perl grep '[how] to' 0.63(2.51+0.42) 0.33(0.98+0.39) -47.6%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.17(5.61+0.35) 0.34(1.08+0.46) -70.9%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.43(1.52+0.44) 0.30(0.88+0.42) -30.2%
The conditional support for JIT is implemented as suggested in the
pcrejit(3) man page. E.g. defining PCRE_STUDY_JIT_COMPILE to 0 if it's
not present.
The implementation is relatively verbose because even if
PCRE_CONFIG_JIT is defined only a call to pcre_config() can determine
if the JIT is available, and if so the faster pcre_jit_exec() function
should be called instead of pcre_exec(), and a different (but not
complimentary!) function needs to be called to free pcre1_extra_info.
There's no graceful fallback if pcre_jit_stack_alloc() fails under
PCRE_CONFIG_JIT, instead the program will simply abort. I don't think
this is worth handling gracefully, it'll only fail in cases where
malloc() doesn't work, in which case we're screwed anyway.
That there's no assignment of `p->pcre1_jit_on = 0` when
PCRE_CONFIG_JIT isn't defined isn't a bug. The create_grep_pat()
function allocates the grep_pat allocates it with calloc(), so it's
guaranteed to be 0 when PCRE_CONFIG_JIT isn't defined.
I you're bisecting and find this change, check that your PCRE isn't
older than 8.32. This change intentionally broke really old versions
of PCRE, but that's fixed in follow-up commits.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-25 22:05:25 +02:00
|
|
|
pcre_config(PCRE_CONFIG_JIT, &p->pcre1_jit_on);
|
|
|
|
if (p->pcre1_jit_on == 1) {
|
|
|
|
p->pcre1_jit_stack = pcre_jit_stack_alloc(1, 1024 * 1024);
|
|
|
|
if (!p->pcre1_jit_stack)
|
|
|
|
die("Couldn't allocate PCRE JIT stack");
|
|
|
|
pcre_assign_jit_stack(p->pcre1_extra_info, NULL, p->pcre1_jit_stack);
|
|
|
|
} else if (p->pcre1_jit_on != 0) {
|
|
|
|
die("BUG: The pcre1_jit_on variable should be 0 or 1, not %d",
|
|
|
|
p->pcre1_jit_on);
|
|
|
|
}
|
|
|
|
#endif
|
2011-05-09 23:52:05 +02:00
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:29 +02:00
|
|
|
static int pcre1match(struct grep_pat *p, const char *line, const char *eol,
|
2011-05-09 23:52:05 +02:00
|
|
|
regmatch_t *match, int eflags)
|
|
|
|
{
|
|
|
|
int ovector[30], ret, flags = 0;
|
|
|
|
|
|
|
|
if (eflags & REG_NOTBOL)
|
|
|
|
flags |= PCRE_NOTBOL;
|
|
|
|
|
2017-05-25 22:05:26 +02:00
|
|
|
#ifdef GIT_PCRE1_USE_JIT
|
grep: add support for the PCRE v1 JIT API
Change the grep PCRE v1 code to use JIT when available. When PCRE
support was initially added in commit 63e7e9d8b6 ("git-grep: Learn
PCRE", 2011-05-09) PCRE had no JIT support, it was integrated into
8.20 released on 2011-10-21.
Enabling JIT support usually improves performance by more than
40%. The pattern compilation times are relatively slower, but those
relative numbers are tiny, and are easily made back in all but the
most trivial cases of grep. Detailed benchmarks & overview of
compilation times is at: http://sljit.sourceforge.net/pcre.html
With this change the difference in a t/perf/p7820-grep-engines.sh run
is, with just the /perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~ HEAD p7820-grep-engines.sh
Test HEAD~ HEAD
---------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.35(1.11+0.43) 0.23(0.42+0.46) -34.3%
7820.7: perl grep '^how to' 0.64(2.71+0.36) 0.27(0.66+0.44) -57.8%
7820.11: perl grep '[how] to' 0.63(2.51+0.42) 0.33(0.98+0.39) -47.6%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.17(5.61+0.35) 0.34(1.08+0.46) -70.9%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.43(1.52+0.44) 0.30(0.88+0.42) -30.2%
The conditional support for JIT is implemented as suggested in the
pcrejit(3) man page. E.g. defining PCRE_STUDY_JIT_COMPILE to 0 if it's
not present.
The implementation is relatively verbose because even if
PCRE_CONFIG_JIT is defined only a call to pcre_config() can determine
if the JIT is available, and if so the faster pcre_jit_exec() function
should be called instead of pcre_exec(), and a different (but not
complimentary!) function needs to be called to free pcre1_extra_info.
There's no graceful fallback if pcre_jit_stack_alloc() fails under
PCRE_CONFIG_JIT, instead the program will simply abort. I don't think
this is worth handling gracefully, it'll only fail in cases where
malloc() doesn't work, in which case we're screwed anyway.
That there's no assignment of `p->pcre1_jit_on = 0` when
PCRE_CONFIG_JIT isn't defined isn't a bug. The create_grep_pat()
function allocates the grep_pat allocates it with calloc(), so it's
guaranteed to be 0 when PCRE_CONFIG_JIT isn't defined.
I you're bisecting and find this change, check that your PCRE isn't
older than 8.32. This change intentionally broke really old versions
of PCRE, but that's fixed in follow-up commits.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-25 22:05:25 +02:00
|
|
|
if (p->pcre1_jit_on) {
|
|
|
|
ret = pcre_jit_exec(p->pcre1_regexp, p->pcre1_extra_info, line,
|
|
|
|
eol - line, 0, flags, ovector,
|
|
|
|
ARRAY_SIZE(ovector), p->pcre1_jit_stack);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
ret = pcre_exec(p->pcre1_regexp, p->pcre1_extra_info, line,
|
|
|
|
eol - line, 0, flags, ovector,
|
|
|
|
ARRAY_SIZE(ovector));
|
|
|
|
}
|
|
|
|
|
2011-05-09 23:52:05 +02:00
|
|
|
if (ret < 0 && ret != PCRE_ERROR_NOMATCH)
|
|
|
|
die("pcre_exec failed with error code %d", ret);
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
match->rm_so = ovector[0];
|
|
|
|
match->rm_eo = ovector[1];
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:29 +02:00
|
|
|
static void free_pcre1_regexp(struct grep_pat *p)
|
2011-05-09 23:52:05 +02:00
|
|
|
{
|
2017-05-25 21:45:29 +02:00
|
|
|
pcre_free(p->pcre1_regexp);
|
2017-05-25 22:05:26 +02:00
|
|
|
#ifdef GIT_PCRE1_USE_JIT
|
grep: add support for the PCRE v1 JIT API
Change the grep PCRE v1 code to use JIT when available. When PCRE
support was initially added in commit 63e7e9d8b6 ("git-grep: Learn
PCRE", 2011-05-09) PCRE had no JIT support, it was integrated into
8.20 released on 2011-10-21.
Enabling JIT support usually improves performance by more than
40%. The pattern compilation times are relatively slower, but those
relative numbers are tiny, and are easily made back in all but the
most trivial cases of grep. Detailed benchmarks & overview of
compilation times is at: http://sljit.sourceforge.net/pcre.html
With this change the difference in a t/perf/p7820-grep-engines.sh run
is, with just the /perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_OPTS='-j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~ HEAD p7820-grep-engines.sh
Test HEAD~ HEAD
---------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.35(1.11+0.43) 0.23(0.42+0.46) -34.3%
7820.7: perl grep '^how to' 0.64(2.71+0.36) 0.27(0.66+0.44) -57.8%
7820.11: perl grep '[how] to' 0.63(2.51+0.42) 0.33(0.98+0.39) -47.6%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.17(5.61+0.35) 0.34(1.08+0.46) -70.9%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.43(1.52+0.44) 0.30(0.88+0.42) -30.2%
The conditional support for JIT is implemented as suggested in the
pcrejit(3) man page. E.g. defining PCRE_STUDY_JIT_COMPILE to 0 if it's
not present.
The implementation is relatively verbose because even if
PCRE_CONFIG_JIT is defined only a call to pcre_config() can determine
if the JIT is available, and if so the faster pcre_jit_exec() function
should be called instead of pcre_exec(), and a different (but not
complimentary!) function needs to be called to free pcre1_extra_info.
There's no graceful fallback if pcre_jit_stack_alloc() fails under
PCRE_CONFIG_JIT, instead the program will simply abort. I don't think
this is worth handling gracefully, it'll only fail in cases where
malloc() doesn't work, in which case we're screwed anyway.
That there's no assignment of `p->pcre1_jit_on = 0` when
PCRE_CONFIG_JIT isn't defined isn't a bug. The create_grep_pat()
function allocates the grep_pat allocates it with calloc(), so it's
guaranteed to be 0 when PCRE_CONFIG_JIT isn't defined.
I you're bisecting and find this change, check that your PCRE isn't
older than 8.32. This change intentionally broke really old versions
of PCRE, but that's fixed in follow-up commits.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-25 22:05:25 +02:00
|
|
|
if (p->pcre1_jit_on) {
|
|
|
|
pcre_free_study(p->pcre1_extra_info);
|
|
|
|
pcre_jit_stack_free(p->pcre1_jit_stack);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
pcre_free(p->pcre1_extra_info);
|
|
|
|
}
|
2017-05-25 21:45:29 +02:00
|
|
|
pcre_free((void *)p->pcre1_tables);
|
2011-05-09 23:52:05 +02:00
|
|
|
}
|
2017-05-25 21:45:28 +02:00
|
|
|
#else /* !USE_LIBPCRE1 */
|
2017-05-25 21:45:29 +02:00
|
|
|
static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt)
|
2011-05-09 23:52:05 +02:00
|
|
|
{
|
|
|
|
die("cannot use Perl-compatible regexes when not compiled with USE_LIBPCRE");
|
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:29 +02:00
|
|
|
static int pcre1match(struct grep_pat *p, const char *line, const char *eol,
|
2011-05-09 23:52:05 +02:00
|
|
|
regmatch_t *match, int eflags)
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:29 +02:00
|
|
|
static void free_pcre1_regexp(struct grep_pat *p)
|
2011-05-09 23:52:05 +02:00
|
|
|
{
|
|
|
|
}
|
2017-05-25 21:45:28 +02:00
|
|
|
#endif /* !USE_LIBPCRE1 */
|
2011-05-09 23:52:05 +02:00
|
|
|
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
#ifdef USE_LIBPCRE2
|
|
|
|
static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
PCRE2_UCHAR errbuf[256];
|
|
|
|
PCRE2_SIZE erroffset;
|
|
|
|
int options = PCRE2_MULTILINE;
|
|
|
|
const uint8_t *character_tables = NULL;
|
|
|
|
int jitret;
|
|
|
|
|
|
|
|
assert(opt->pcre2);
|
|
|
|
|
|
|
|
p->pcre2_compile_context = NULL;
|
|
|
|
|
|
|
|
if (opt->ignore_case) {
|
|
|
|
if (has_non_ascii(p->pattern)) {
|
|
|
|
character_tables = pcre2_maketables(NULL);
|
|
|
|
p->pcre2_compile_context = pcre2_compile_context_create(NULL);
|
|
|
|
pcre2_set_character_tables(p->pcre2_compile_context, character_tables);
|
|
|
|
}
|
|
|
|
options |= PCRE2_CASELESS;
|
|
|
|
}
|
|
|
|
if (is_utf8_locale() && has_non_ascii(p->pattern))
|
|
|
|
options |= PCRE2_UTF;
|
|
|
|
|
|
|
|
p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
|
|
|
|
p->patternlen, options, &error, &erroffset,
|
|
|
|
p->pcre2_compile_context);
|
|
|
|
|
|
|
|
if (p->pcre2_pattern) {
|
|
|
|
p->pcre2_match_data = pcre2_match_data_create_from_pattern(p->pcre2_pattern, NULL);
|
|
|
|
if (!p->pcre2_match_data)
|
|
|
|
die("Couldn't allocate PCRE2 match data");
|
|
|
|
} else {
|
|
|
|
pcre2_get_error_message(error, errbuf, sizeof(errbuf));
|
|
|
|
compile_regexp_failed(p, (const char *)&errbuf);
|
|
|
|
}
|
|
|
|
|
|
|
|
pcre2_config(PCRE2_CONFIG_JIT, &p->pcre2_jit_on);
|
|
|
|
if (p->pcre2_jit_on == 1) {
|
|
|
|
jitret = pcre2_jit_compile(p->pcre2_pattern, PCRE2_JIT_COMPLETE);
|
|
|
|
if (jitret)
|
|
|
|
die("Couldn't JIT the PCRE2 pattern '%s', got '%d'\n", p->pattern, jitret);
|
|
|
|
p->pcre2_jit_stack = pcre2_jit_stack_create(1, 1024 * 1024, NULL);
|
|
|
|
if (!p->pcre2_jit_stack)
|
|
|
|
die("Couldn't allocate PCRE2 JIT stack");
|
|
|
|
p->pcre2_match_context = pcre2_match_context_create(NULL);
|
2017-06-20 00:01:48 +02:00
|
|
|
if (!p->pcre2_match_context)
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
die("Couldn't allocate PCRE2 match context");
|
|
|
|
pcre2_jit_stack_assign(p->pcre2_match_context, NULL, p->pcre2_jit_stack);
|
|
|
|
} else if (p->pcre2_jit_on != 0) {
|
|
|
|
die("BUG: The pcre2_jit_on variable should be 0 or 1, not %d",
|
|
|
|
p->pcre1_jit_on);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int pcre2match(struct grep_pat *p, const char *line, const char *eol,
|
|
|
|
regmatch_t *match, int eflags)
|
|
|
|
{
|
|
|
|
int ret, flags = 0;
|
|
|
|
PCRE2_SIZE *ovector;
|
|
|
|
PCRE2_UCHAR errbuf[256];
|
|
|
|
|
|
|
|
if (eflags & REG_NOTBOL)
|
|
|
|
flags |= PCRE2_NOTBOL;
|
|
|
|
|
|
|
|
if (p->pcre2_jit_on)
|
|
|
|
ret = pcre2_jit_match(p->pcre2_pattern, (unsigned char *)line,
|
|
|
|
eol - line, 0, flags, p->pcre2_match_data,
|
|
|
|
NULL);
|
|
|
|
else
|
|
|
|
ret = pcre2_match(p->pcre2_pattern, (unsigned char *)line,
|
|
|
|
eol - line, 0, flags, p->pcre2_match_data,
|
|
|
|
NULL);
|
|
|
|
|
|
|
|
if (ret < 0 && ret != PCRE2_ERROR_NOMATCH) {
|
|
|
|
pcre2_get_error_message(ret, errbuf, sizeof(errbuf));
|
|
|
|
die("%s failed with error code %d: %s",
|
|
|
|
(p->pcre2_jit_on ? "pcre2_jit_match" : "pcre2_match"), ret,
|
|
|
|
errbuf);
|
|
|
|
}
|
|
|
|
if (ret > 0) {
|
|
|
|
ovector = pcre2_get_ovector_pointer(p->pcre2_match_data);
|
|
|
|
ret = 0;
|
|
|
|
match->rm_so = (int)ovector[0];
|
|
|
|
match->rm_eo = (int)ovector[1];
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_pcre2_pattern(struct grep_pat *p)
|
|
|
|
{
|
|
|
|
pcre2_compile_context_free(p->pcre2_compile_context);
|
|
|
|
pcre2_code_free(p->pcre2_pattern);
|
|
|
|
pcre2_match_data_free(p->pcre2_match_data);
|
|
|
|
pcre2_jit_stack_free(p->pcre2_jit_stack);
|
|
|
|
pcre2_match_context_free(p->pcre2_match_context);
|
|
|
|
}
|
|
|
|
#else /* !USE_LIBPCRE2 */
|
|
|
|
static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Unreachable until USE_LIBPCRE2 becomes synonymous with
|
|
|
|
* USE_LIBPCRE. See the sibling comment in
|
|
|
|
* grep_set_pattern_type_option().
|
|
|
|
*/
|
|
|
|
die("cannot use Perl-compatible regexes when not compiled with USE_LIBPCRE");
|
|
|
|
}
|
|
|
|
|
|
|
|
static int pcre2match(struct grep_pat *p, const char *line, const char *eol,
|
|
|
|
regmatch_t *match, int eflags)
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_pcre2_pattern(struct grep_pat *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* !USE_LIBPCRE2 */
|
|
|
|
|
2016-06-25 07:22:31 +02:00
|
|
|
static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
struct strbuf sb = STRBUF_INIT;
|
|
|
|
int err;
|
2017-06-30 00:22:22 +02:00
|
|
|
int regflags = 0;
|
2016-06-25 07:22:31 +02:00
|
|
|
|
|
|
|
basic_regex_quote_buf(&sb, p->pattern);
|
|
|
|
if (opt->ignore_case)
|
|
|
|
regflags |= REG_ICASE;
|
|
|
|
err = regcomp(&p->regexp, sb.buf, regflags);
|
|
|
|
if (opt->debug)
|
|
|
|
fprintf(stderr, "fixed %s\n", sb.buf);
|
|
|
|
strbuf_release(&sb);
|
|
|
|
if (err) {
|
|
|
|
char errbuf[1024];
|
|
|
|
regerror(err, &p->regexp, errbuf, sizeof(errbuf));
|
|
|
|
regfree(&p->regexp);
|
|
|
|
compile_regexp_failed(p, errbuf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
|
|
|
|
{
|
2017-06-30 00:22:21 +02:00
|
|
|
int ascii_only;
|
2009-01-10 00:18:34 +01:00
|
|
|
int err;
|
2017-06-30 00:22:21 +02:00
|
|
|
int regflags = REG_NEWLINE;
|
2009-01-10 00:18:34 +01:00
|
|
|
|
2009-03-07 13:28:40 +01:00
|
|
|
p->word_regexp = opt->word_regexp;
|
2009-11-06 10:22:35 +01:00
|
|
|
p->ignore_case = opt->ignore_case;
|
2016-06-25 07:22:30 +02:00
|
|
|
ascii_only = !has_non_ascii(p->pattern);
|
2009-03-07 13:28:40 +01:00
|
|
|
|
2016-06-25 07:22:31 +02:00
|
|
|
/*
|
|
|
|
* Even when -F (fixed) asks us to do a non-regexp search, we
|
|
|
|
* may not be able to correctly case-fold when -i
|
|
|
|
* (ignore-case) is asked (in which case, we'll synthesize a
|
|
|
|
* regexp to match the pattern that matches regexp special
|
|
|
|
* characters literally, while ignoring case differences). On
|
|
|
|
* the other hand, even without -F, if the pattern does not
|
|
|
|
* have any regexp special characters and there is no need for
|
|
|
|
* case-folding search, we can internally turn it into a
|
|
|
|
* simple string match using kws. p->fixed tells us if we
|
|
|
|
* want to use kws.
|
|
|
|
*/
|
2017-05-25 21:45:27 +02:00
|
|
|
if (opt->fixed ||
|
|
|
|
has_null(p->pattern, p->patternlen) ||
|
|
|
|
is_fixed(p->pattern, p->patternlen))
|
2017-06-30 00:22:21 +02:00
|
|
|
p->fixed = !p->ignore_case || ascii_only;
|
Use kwset in grep
Benchmarks for the hot cache case:
before:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
3,478,085 cache-misses # 2.322 M/sec ( +- 2.690% )
11,356,177 cache-references # 7.582 M/sec ( +- 2.598% )
3,872,184 branch-misses # 0.363 % ( +- 0.258% )
1,067,367,848 branches # 712.673 M/sec ( +- 2.622% )
3,828,370,782 instructions # 0.947 IPC ( +- 0.033% )
4,043,832,831 cycles # 2700.037 M/sec ( +- 0.167% )
8,518 page-faults # 0.006 M/sec ( +- 3.648% )
847 CPU-migrations # 0.001 M/sec ( +- 3.262% )
6,546 context-switches # 0.004 M/sec ( +- 2.292% )
1497.695495 task-clock-msecs # 3.303 CPUs ( +- 2.550% )
0.453394396 seconds time elapsed ( +- 0.912% )
after:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
2,989,918 cache-misses # 3.166 M/sec ( +- 5.013% )
10,986,041 cache-references # 11.633 M/sec ( +- 4.899% ) (scaled from 95.06%)
3,511,993 branch-misses # 1.422 % ( +- 0.785% )
246,893,561 branches # 261.433 M/sec ( +- 3.967% )
1,392,727,757 instructions # 0.564 IPC ( +- 0.040% )
2,468,142,397 cycles # 2613.494 M/sec ( +- 0.110% )
7,747 page-faults # 0.008 M/sec ( +- 3.995% )
897 CPU-migrations # 0.001 M/sec ( +- 2.383% )
6,535 context-switches # 0.007 M/sec ( +- 1.993% )
944.384228 task-clock-msecs # 3.177 CPUs ( +- 0.268% )
0.297257643 seconds time elapsed ( +- 0.450% )
So we gain about 35% by using the kwset code.
As a side effect of using kwset two grep tests are fixed by this
patch. The first is fixed because kwset can deal with case-insensitive
search containing NULs, something strcasestr cannot do. The second one
is fixed because we consider patterns containing NULs as fixed strings
(regcomp cannot accept patterns with NULs).
Signed-off-by: Fredrik Kuivinen <frekui@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-21 00:42:18 +02:00
|
|
|
|
|
|
|
if (p->fixed) {
|
2017-06-30 00:22:21 +02:00
|
|
|
p->kws = kwsalloc(p->ignore_case ? tolower_trans_tbl : NULL);
|
Use kwset in grep
Benchmarks for the hot cache case:
before:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
3,478,085 cache-misses # 2.322 M/sec ( +- 2.690% )
11,356,177 cache-references # 7.582 M/sec ( +- 2.598% )
3,872,184 branch-misses # 0.363 % ( +- 0.258% )
1,067,367,848 branches # 712.673 M/sec ( +- 2.622% )
3,828,370,782 instructions # 0.947 IPC ( +- 0.033% )
4,043,832,831 cycles # 2700.037 M/sec ( +- 0.167% )
8,518 page-faults # 0.006 M/sec ( +- 3.648% )
847 CPU-migrations # 0.001 M/sec ( +- 3.262% )
6,546 context-switches # 0.004 M/sec ( +- 2.292% )
1497.695495 task-clock-msecs # 3.303 CPUs ( +- 2.550% )
0.453394396 seconds time elapsed ( +- 0.912% )
after:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
2,989,918 cache-misses # 3.166 M/sec ( +- 5.013% )
10,986,041 cache-references # 11.633 M/sec ( +- 4.899% ) (scaled from 95.06%)
3,511,993 branch-misses # 1.422 % ( +- 0.785% )
246,893,561 branches # 261.433 M/sec ( +- 3.967% )
1,392,727,757 instructions # 0.564 IPC ( +- 0.040% )
2,468,142,397 cycles # 2613.494 M/sec ( +- 0.110% )
7,747 page-faults # 0.008 M/sec ( +- 3.995% )
897 CPU-migrations # 0.001 M/sec ( +- 2.383% )
6,535 context-switches # 0.007 M/sec ( +- 1.993% )
944.384228 task-clock-msecs # 3.177 CPUs ( +- 0.268% )
0.297257643 seconds time elapsed ( +- 0.450% )
So we gain about 35% by using the kwset code.
As a side effect of using kwset two grep tests are fixed by this
patch. The first is fixed because kwset can deal with case-insensitive
search containing NULs, something strcasestr cannot do. The second one
is fixed because we consider patterns containing NULs as fixed strings
(regcomp cannot accept patterns with NULs).
Signed-off-by: Fredrik Kuivinen <frekui@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-21 00:42:18 +02:00
|
|
|
kwsincr(p->kws, p->pattern, p->patternlen);
|
|
|
|
kwsprep(p->kws);
|
2009-01-10 00:18:34 +01:00
|
|
|
return;
|
2016-06-25 07:22:31 +02:00
|
|
|
} else if (opt->fixed) {
|
|
|
|
/*
|
|
|
|
* We come here when the pattern has the non-ascii
|
|
|
|
* characters we cannot case-fold, and asked to
|
|
|
|
* ignore-case.
|
|
|
|
*/
|
|
|
|
compile_fixed_regexp(p, opt);
|
|
|
|
return;
|
Use kwset in grep
Benchmarks for the hot cache case:
before:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
3,478,085 cache-misses # 2.322 M/sec ( +- 2.690% )
11,356,177 cache-references # 7.582 M/sec ( +- 2.598% )
3,872,184 branch-misses # 0.363 % ( +- 0.258% )
1,067,367,848 branches # 712.673 M/sec ( +- 2.622% )
3,828,370,782 instructions # 0.947 IPC ( +- 0.033% )
4,043,832,831 cycles # 2700.037 M/sec ( +- 0.167% )
8,518 page-faults # 0.006 M/sec ( +- 3.648% )
847 CPU-migrations # 0.001 M/sec ( +- 3.262% )
6,546 context-switches # 0.004 M/sec ( +- 2.292% )
1497.695495 task-clock-msecs # 3.303 CPUs ( +- 2.550% )
0.453394396 seconds time elapsed ( +- 0.912% )
after:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
2,989,918 cache-misses # 3.166 M/sec ( +- 5.013% )
10,986,041 cache-references # 11.633 M/sec ( +- 4.899% ) (scaled from 95.06%)
3,511,993 branch-misses # 1.422 % ( +- 0.785% )
246,893,561 branches # 261.433 M/sec ( +- 3.967% )
1,392,727,757 instructions # 0.564 IPC ( +- 0.040% )
2,468,142,397 cycles # 2613.494 M/sec ( +- 0.110% )
7,747 page-faults # 0.008 M/sec ( +- 3.995% )
897 CPU-migrations # 0.001 M/sec ( +- 2.383% )
6,535 context-switches # 0.007 M/sec ( +- 1.993% )
944.384228 task-clock-msecs # 3.177 CPUs ( +- 0.268% )
0.297257643 seconds time elapsed ( +- 0.450% )
So we gain about 35% by using the kwset code.
As a side effect of using kwset two grep tests are fixed by this
patch. The first is fixed because kwset can deal with case-insensitive
search containing NULs, something strcasestr cannot do. The second one
is fixed because we consider patterns containing NULs as fixed strings
(regcomp cannot accept patterns with NULs).
Signed-off-by: Fredrik Kuivinen <frekui@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-21 00:42:18 +02:00
|
|
|
}
|
2009-01-10 00:18:34 +01:00
|
|
|
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
if (opt->pcre2) {
|
|
|
|
compile_pcre2_pattern(p, opt);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-05-25 21:45:29 +02:00
|
|
|
if (opt->pcre1) {
|
|
|
|
compile_pcre1_regexp(p, opt);
|
2011-05-09 23:52:05 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-06-30 00:22:21 +02:00
|
|
|
if (p->ignore_case)
|
|
|
|
regflags |= REG_ICASE;
|
|
|
|
if (opt->extended_regexp_option)
|
|
|
|
regflags |= REG_EXTENDED;
|
|
|
|
err = regcomp(&p->regexp, p->pattern, regflags);
|
2006-09-18 01:02:52 +02:00
|
|
|
if (err) {
|
|
|
|
char errbuf[1024];
|
|
|
|
regerror(err, &p->regexp, errbuf, 1024);
|
|
|
|
regfree(&p->regexp);
|
2011-05-09 23:52:04 +02:00
|
|
|
compile_regexp_failed(p, errbuf);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
static struct grep_expr *compile_pattern_or(struct grep_pat **);
|
2006-09-18 01:02:52 +02:00
|
|
|
static struct grep_expr *compile_pattern_atom(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
struct grep_expr *x;
|
|
|
|
|
|
|
|
p = *list;
|
2009-04-27 20:10:24 +02:00
|
|
|
if (!p)
|
|
|
|
return NULL;
|
2006-09-18 01:02:52 +02:00
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
2006-09-20 21:39:46 +02:00
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
2006-09-18 01:02:52 +02:00
|
|
|
x = xcalloc(1, sizeof (struct grep_expr));
|
|
|
|
x->node = GREP_NODE_ATOM;
|
|
|
|
x->u.atom = p;
|
|
|
|
*list = p->next;
|
|
|
|
return x;
|
|
|
|
case GREP_OPEN_PAREN:
|
|
|
|
*list = p->next;
|
2006-09-28 02:50:52 +02:00
|
|
|
x = compile_pattern_or(list);
|
2006-09-18 01:02:52 +02:00
|
|
|
if (!*list || (*list)->token != GREP_CLOSE_PAREN)
|
|
|
|
die("unmatched parenthesis");
|
|
|
|
*list = (*list)->next;
|
|
|
|
return x;
|
|
|
|
default:
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *compile_pattern_not(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
struct grep_expr *x;
|
|
|
|
|
|
|
|
p = *list;
|
2009-04-27 20:10:24 +02:00
|
|
|
if (!p)
|
|
|
|
return NULL;
|
2006-09-18 01:02:52 +02:00
|
|
|
switch (p->token) {
|
|
|
|
case GREP_NOT:
|
|
|
|
if (!p->next)
|
|
|
|
die("--not not followed by pattern expression");
|
|
|
|
*list = p->next;
|
|
|
|
x = xcalloc(1, sizeof (struct grep_expr));
|
|
|
|
x->node = GREP_NODE_NOT;
|
|
|
|
x->u.unary = compile_pattern_not(list);
|
|
|
|
if (!x->u.unary)
|
|
|
|
die("--not followed by non pattern expression");
|
|
|
|
return x;
|
|
|
|
default:
|
|
|
|
return compile_pattern_atom(list);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *compile_pattern_and(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
struct grep_expr *x, *y, *z;
|
|
|
|
|
|
|
|
x = compile_pattern_not(list);
|
|
|
|
p = *list;
|
|
|
|
if (p && p->token == GREP_AND) {
|
|
|
|
if (!p->next)
|
|
|
|
die("--and not followed by pattern expression");
|
|
|
|
*list = p->next;
|
|
|
|
y = compile_pattern_and(list);
|
|
|
|
if (!y)
|
|
|
|
die("--and not followed by pattern expression");
|
|
|
|
z = xcalloc(1, sizeof (struct grep_expr));
|
|
|
|
z->node = GREP_NODE_AND;
|
|
|
|
z->u.binary.left = x;
|
|
|
|
z->u.binary.right = y;
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *compile_pattern_or(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
struct grep_expr *x, *y, *z;
|
|
|
|
|
|
|
|
x = compile_pattern_and(list);
|
|
|
|
p = *list;
|
|
|
|
if (x && p && p->token != GREP_CLOSE_PAREN) {
|
|
|
|
y = compile_pattern_or(list);
|
|
|
|
if (!y)
|
|
|
|
die("not a pattern expression %s", p->pattern);
|
|
|
|
z = xcalloc(1, sizeof (struct grep_expr));
|
|
|
|
z->node = GREP_NODE_OR;
|
|
|
|
z->u.binary.left = x;
|
|
|
|
z->u.binary.right = y;
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *compile_pattern_expr(struct grep_pat **list)
|
|
|
|
{
|
|
|
|
return compile_pattern_or(list);
|
|
|
|
}
|
|
|
|
|
2012-09-13 23:21:44 +02:00
|
|
|
static void indent(int in)
|
|
|
|
{
|
|
|
|
while (in-- > 0)
|
|
|
|
fputc(' ', stderr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void dump_grep_pat(struct grep_pat *p)
|
|
|
|
{
|
|
|
|
switch (p->token) {
|
|
|
|
case GREP_AND: fprintf(stderr, "*and*"); break;
|
|
|
|
case GREP_OPEN_PAREN: fprintf(stderr, "*(*"); break;
|
|
|
|
case GREP_CLOSE_PAREN: fprintf(stderr, "*)*"); break;
|
|
|
|
case GREP_NOT: fprintf(stderr, "*not*"); break;
|
|
|
|
case GREP_OR: fprintf(stderr, "*or*"); break;
|
|
|
|
|
|
|
|
case GREP_PATTERN: fprintf(stderr, "pattern"); break;
|
|
|
|
case GREP_PATTERN_HEAD: fprintf(stderr, "pattern_head"); break;
|
|
|
|
case GREP_PATTERN_BODY: fprintf(stderr, "pattern_body"); break;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (p->token) {
|
|
|
|
default: break;
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
fprintf(stderr, "<head %d>", p->field); break;
|
|
|
|
case GREP_PATTERN_BODY:
|
|
|
|
fprintf(stderr, "<body>"); break;
|
|
|
|
}
|
|
|
|
switch (p->token) {
|
|
|
|
default: break;
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
|
|
|
case GREP_PATTERN:
|
|
|
|
fprintf(stderr, "%.*s", (int)p->patternlen, p->pattern);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
fputc('\n', stderr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void dump_grep_expression_1(struct grep_expr *x, int in)
|
|
|
|
{
|
|
|
|
indent(in);
|
|
|
|
switch (x->node) {
|
|
|
|
case GREP_NODE_TRUE:
|
|
|
|
fprintf(stderr, "true\n");
|
|
|
|
break;
|
|
|
|
case GREP_NODE_ATOM:
|
|
|
|
dump_grep_pat(x->u.atom);
|
|
|
|
break;
|
|
|
|
case GREP_NODE_NOT:
|
|
|
|
fprintf(stderr, "(not\n");
|
|
|
|
dump_grep_expression_1(x->u.unary, in+1);
|
|
|
|
indent(in);
|
|
|
|
fprintf(stderr, ")\n");
|
|
|
|
break;
|
|
|
|
case GREP_NODE_AND:
|
|
|
|
fprintf(stderr, "(and\n");
|
|
|
|
dump_grep_expression_1(x->u.binary.left, in+1);
|
|
|
|
dump_grep_expression_1(x->u.binary.right, in+1);
|
|
|
|
indent(in);
|
|
|
|
fprintf(stderr, ")\n");
|
|
|
|
break;
|
|
|
|
case GREP_NODE_OR:
|
|
|
|
fprintf(stderr, "(or\n");
|
|
|
|
dump_grep_expression_1(x->u.binary.left, in+1);
|
|
|
|
dump_grep_expression_1(x->u.binary.right, in+1);
|
|
|
|
indent(in);
|
|
|
|
fprintf(stderr, ")\n");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-09-15 23:04:36 +02:00
|
|
|
static void dump_grep_expression(struct grep_opt *opt)
|
2012-09-13 23:21:44 +02:00
|
|
|
{
|
|
|
|
struct grep_expr *x = opt->pattern_expression;
|
|
|
|
|
|
|
|
if (opt->all_match)
|
|
|
|
fprintf(stderr, "[all-match]\n");
|
|
|
|
dump_grep_expression_1(x, 0);
|
|
|
|
fflush(NULL);
|
|
|
|
}
|
|
|
|
|
2010-09-13 07:15:35 +02:00
|
|
|
static struct grep_expr *grep_true_expr(void)
|
|
|
|
{
|
|
|
|
struct grep_expr *z = xcalloc(1, sizeof(*z));
|
|
|
|
z->node = GREP_NODE_TRUE;
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct grep_expr *grep_or_expr(struct grep_expr *left, struct grep_expr *right)
|
|
|
|
{
|
|
|
|
struct grep_expr *z = xcalloc(1, sizeof(*z));
|
|
|
|
z->node = GREP_NODE_OR;
|
|
|
|
z->u.binary.left = left;
|
|
|
|
z->u.binary.right = right;
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
|
2010-09-13 04:30:48 +02:00
|
|
|
static struct grep_expr *prep_header_patterns(struct grep_opt *opt)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
2010-09-13 04:30:48 +02:00
|
|
|
struct grep_expr *header_expr;
|
2010-09-13 07:15:35 +02:00
|
|
|
struct grep_expr *(header_group[GREP_HEADER_FIELD_MAX]);
|
|
|
|
enum grep_header_field fld;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2010-09-13 04:30:48 +02:00
|
|
|
if (!opt->header_list)
|
|
|
|
return NULL;
|
2012-05-06 20:17:15 +02:00
|
|
|
|
2010-09-13 04:30:48 +02:00
|
|
|
for (p = opt->header_list; p; p = p->next) {
|
|
|
|
if (p->token != GREP_PATTERN_HEAD)
|
2016-07-26 18:05:50 +02:00
|
|
|
die("BUG: a non-header pattern in grep header list.");
|
2013-02-03 15:37:09 +01:00
|
|
|
if (p->field < GREP_HEADER_FIELD_MIN ||
|
|
|
|
GREP_HEADER_FIELD_MAX <= p->field)
|
2016-07-26 18:05:50 +02:00
|
|
|
die("BUG: unknown header field %d", p->field);
|
2010-09-13 04:30:48 +02:00
|
|
|
compile_regexp(p, opt);
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
}
|
2010-09-13 07:15:35 +02:00
|
|
|
|
|
|
|
for (fld = 0; fld < GREP_HEADER_FIELD_MAX; fld++)
|
|
|
|
header_group[fld] = NULL;
|
|
|
|
|
|
|
|
for (p = opt->header_list; p; p = p->next) {
|
|
|
|
struct grep_expr *h;
|
|
|
|
struct grep_pat *pp = p;
|
|
|
|
|
|
|
|
h = compile_pattern_atom(&pp);
|
|
|
|
if (!h || pp != p->next)
|
2016-07-26 18:05:50 +02:00
|
|
|
die("BUG: malformed header expr");
|
2010-09-13 07:15:35 +02:00
|
|
|
if (!header_group[p->field]) {
|
|
|
|
header_group[p->field] = h;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
header_group[p->field] = grep_or_expr(h, header_group[p->field]);
|
|
|
|
}
|
|
|
|
|
|
|
|
header_expr = NULL;
|
|
|
|
|
|
|
|
for (fld = 0; fld < GREP_HEADER_FIELD_MAX; fld++) {
|
|
|
|
if (!header_group[fld])
|
|
|
|
continue;
|
|
|
|
if (!header_expr)
|
|
|
|
header_expr = grep_true_expr();
|
|
|
|
header_expr = grep_or_expr(header_group[fld], header_expr);
|
|
|
|
}
|
2010-09-13 04:30:48 +02:00
|
|
|
return header_expr;
|
|
|
|
}
|
|
|
|
|
log --grep/--author: honor --all-match honored for multiple --grep patterns
When we have both header expression (which has to be an OR node by
construction) and a pattern expression (which could be anything), we
create a new top-level OR node to bind them together, and the
resulting expression structure looks like this:
OR
/ \
/ \
pattern OR
/ \ / \
..... committer OR
/ \
author TRUE
The three elements on the top-level backbone that are inspected by
the "all-match" logic are "pattern", "committer" and "author". When
there are more than one elements in the "pattern", the top-level
node of the "pattern" part of the subtree is an OR, and that node is
inspected by "all-match".
The result ends up ignoring the "--all-match" given from the command
line. A match on either side of the pattern is considered a match,
hence:
git log --grep=A --grep=B --author=C --all-match
shows the same "authored by C and has either A or B" that is correct
only when run without "--all-match".
Fix this by turning the resulting expression around when "--all-match"
is in effect, like this:
OR
/ \
/ \
/ OR
committer / \
author \
pattern
The set of nodes on the top-level backbone in the resulting
expression becomes "committer", "author", and the nodes that are on
the top-level backbone of the "pattern" subexpression. This makes
the "all-match" logic inspect the same nodes in "pattern" as the
case without the author and/or the committer restriction, and makes
the earlier "log" example to show "authored by C and has A and has
B", which is what the command line expects.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-09-14 01:26:57 +02:00
|
|
|
static struct grep_expr *grep_splice_or(struct grep_expr *x, struct grep_expr *y)
|
|
|
|
{
|
|
|
|
struct grep_expr *z = x;
|
|
|
|
|
|
|
|
while (x) {
|
|
|
|
assert(x->node == GREP_NODE_OR);
|
|
|
|
if (x->u.binary.right &&
|
|
|
|
x->u.binary.right->node == GREP_NODE_TRUE) {
|
|
|
|
x->u.binary.right = y;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
x = x->u.binary.right;
|
|
|
|
}
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
|
2012-09-13 23:21:44 +02:00
|
|
|
static void compile_grep_patterns_real(struct grep_opt *opt)
|
2010-09-13 04:30:48 +02:00
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
struct grep_expr *header_expr = prep_header_patterns(opt);
|
2006-09-28 02:50:52 +02:00
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
2006-09-20 21:39:46 +02:00
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
2009-01-10 00:18:34 +01:00
|
|
|
compile_regexp(p, opt);
|
2006-09-20 21:39:46 +02:00
|
|
|
break;
|
|
|
|
default:
|
2006-09-18 01:02:52 +02:00
|
|
|
opt->extended = 1;
|
2006-09-20 21:39:46 +02:00
|
|
|
break;
|
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
if (opt->all_match || header_expr)
|
|
|
|
opt->extended = 1;
|
2012-09-13 23:21:44 +02:00
|
|
|
else if (!opt->extended && !opt->debug)
|
2006-09-18 01:02:52 +02:00
|
|
|
return;
|
|
|
|
|
|
|
|
p = opt->pattern_list;
|
2009-03-18 21:53:27 +01:00
|
|
|
if (p)
|
|
|
|
opt->pattern_expression = compile_pattern_expr(&p);
|
2006-09-18 01:02:52 +02:00
|
|
|
if (p)
|
|
|
|
die("incomplete pattern expression: %s", p->pattern);
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
|
|
|
|
if (!header_expr)
|
|
|
|
return;
|
|
|
|
|
2010-09-13 07:15:35 +02:00
|
|
|
if (!opt->pattern_expression)
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
opt->pattern_expression = header_expr;
|
log --grep/--author: honor --all-match honored for multiple --grep patterns
When we have both header expression (which has to be an OR node by
construction) and a pattern expression (which could be anything), we
create a new top-level OR node to bind them together, and the
resulting expression structure looks like this:
OR
/ \
/ \
pattern OR
/ \ / \
..... committer OR
/ \
author TRUE
The three elements on the top-level backbone that are inspected by
the "all-match" logic are "pattern", "committer" and "author". When
there are more than one elements in the "pattern", the top-level
node of the "pattern" part of the subtree is an OR, and that node is
inspected by "all-match".
The result ends up ignoring the "--all-match" given from the command
line. A match on either side of the pattern is considered a match,
hence:
git log --grep=A --grep=B --author=C --all-match
shows the same "authored by C and has either A or B" that is correct
only when run without "--all-match".
Fix this by turning the resulting expression around when "--all-match"
is in effect, like this:
OR
/ \
/ \
/ OR
committer / \
author \
pattern
The set of nodes on the top-level backbone in the resulting
expression becomes "committer", "author", and the nodes that are on
the top-level backbone of the "pattern" subexpression. This makes
the "all-match" logic inspect the same nodes in "pattern" as the
case without the author and/or the committer restriction, and makes
the earlier "log" example to show "authored by C and has A and has
B", which is what the command line expects.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-09-14 01:26:57 +02:00
|
|
|
else if (opt->all_match)
|
|
|
|
opt->pattern_expression = grep_splice_or(header_expr,
|
|
|
|
opt->pattern_expression);
|
2010-09-13 07:15:35 +02:00
|
|
|
else
|
|
|
|
opt->pattern_expression = grep_or_expr(opt->pattern_expression,
|
|
|
|
header_expr);
|
"log --author=me --grep=it" should find intersection, not union
Historically, any grep filter in "git log" family of commands were taken
as restricting to commits with any of the words in the commit log message.
However, the user almost always want to find commits "done by this person
on that topic". With "--all-match" option, a series of grep patterns can
be turned into a requirement that all of them must produce a match, but
that makes it impossible to ask for "done by me, on either this or that"
with:
log --author=me --committer=him --grep=this --grep=that
because it will require both "this" and "that" to appear.
Change the "header" parser of grep library to treat the headers specially,
and parse it as:
(all-match-OR (HEADER-AUTHOR me)
(HEADER-COMMITTER him)
(OR
(PATTERN this)
(PATTERN that) ) )
Even though the "log" command line parser doesn't give direct access to
the extended grep syntax to group terms with parentheses, this change will
cover the majority of the case the users would want.
This incidentally revealed that one test in t7002 was bogus. It ran:
log --author=Thor --grep=Thu --format='%s'
and expected (wrongly) "Thu" to match "Thursday" in the author/committer
date, but that would never match, as the timestamp in raw commit buffer
does not have the name of the day-of-the-week.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-18 05:09:06 +01:00
|
|
|
opt->all_match = 1;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2012-09-13 23:21:44 +02:00
|
|
|
void compile_grep_patterns(struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
compile_grep_patterns_real(opt);
|
|
|
|
if (opt->debug)
|
|
|
|
dump_grep_expression(opt);
|
|
|
|
}
|
|
|
|
|
2006-09-28 01:27:10 +02:00
|
|
|
static void free_pattern_expr(struct grep_expr *x)
|
|
|
|
{
|
|
|
|
switch (x->node) {
|
2010-09-13 07:15:35 +02:00
|
|
|
case GREP_NODE_TRUE:
|
2006-09-28 01:27:10 +02:00
|
|
|
case GREP_NODE_ATOM:
|
|
|
|
break;
|
|
|
|
case GREP_NODE_NOT:
|
|
|
|
free_pattern_expr(x->u.unary);
|
|
|
|
break;
|
|
|
|
case GREP_NODE_AND:
|
|
|
|
case GREP_NODE_OR:
|
|
|
|
free_pattern_expr(x->u.binary.left);
|
|
|
|
free_pattern_expr(x->u.binary.right);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
free(x);
|
|
|
|
}
|
|
|
|
|
|
|
|
void free_grep_patterns(struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
struct grep_pat *p, *n;
|
|
|
|
|
|
|
|
for (p = opt->pattern_list; p; p = n) {
|
|
|
|
n = p->next;
|
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
Use kwset in grep
Benchmarks for the hot cache case:
before:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
3,478,085 cache-misses # 2.322 M/sec ( +- 2.690% )
11,356,177 cache-references # 7.582 M/sec ( +- 2.598% )
3,872,184 branch-misses # 0.363 % ( +- 0.258% )
1,067,367,848 branches # 712.673 M/sec ( +- 2.622% )
3,828,370,782 instructions # 0.947 IPC ( +- 0.033% )
4,043,832,831 cycles # 2700.037 M/sec ( +- 0.167% )
8,518 page-faults # 0.006 M/sec ( +- 3.648% )
847 CPU-migrations # 0.001 M/sec ( +- 3.262% )
6,546 context-switches # 0.004 M/sec ( +- 2.292% )
1497.695495 task-clock-msecs # 3.303 CPUs ( +- 2.550% )
0.453394396 seconds time elapsed ( +- 0.912% )
after:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
2,989,918 cache-misses # 3.166 M/sec ( +- 5.013% )
10,986,041 cache-references # 11.633 M/sec ( +- 4.899% ) (scaled from 95.06%)
3,511,993 branch-misses # 1.422 % ( +- 0.785% )
246,893,561 branches # 261.433 M/sec ( +- 3.967% )
1,392,727,757 instructions # 0.564 IPC ( +- 0.040% )
2,468,142,397 cycles # 2613.494 M/sec ( +- 0.110% )
7,747 page-faults # 0.008 M/sec ( +- 3.995% )
897 CPU-migrations # 0.001 M/sec ( +- 2.383% )
6,535 context-switches # 0.007 M/sec ( +- 1.993% )
944.384228 task-clock-msecs # 3.177 CPUs ( +- 0.268% )
0.297257643 seconds time elapsed ( +- 0.450% )
So we gain about 35% by using the kwset code.
As a side effect of using kwset two grep tests are fixed by this
patch. The first is fixed because kwset can deal with case-insensitive
search containing NULs, something strcasestr cannot do. The second one
is fixed because we consider patterns containing NULs as fixed strings
(regcomp cannot accept patterns with NULs).
Signed-off-by: Fredrik Kuivinen <frekui@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-21 00:42:18 +02:00
|
|
|
if (p->kws)
|
|
|
|
kwsfree(p->kws);
|
2017-05-25 21:45:29 +02:00
|
|
|
else if (p->pcre1_regexp)
|
|
|
|
free_pcre1_regexp(p);
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
else if (p->pcre2_pattern)
|
|
|
|
free_pcre2_pattern(p);
|
2011-05-09 23:52:05 +02:00
|
|
|
else
|
|
|
|
regfree(&p->regexp);
|
2012-05-20 16:33:07 +02:00
|
|
|
free(p->pattern);
|
2006-09-28 01:27:10 +02:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
free(p);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!opt->extended)
|
|
|
|
return;
|
|
|
|
free_pattern_expr(opt->pattern_expression);
|
|
|
|
}
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
static char *end_of_line(char *cp, unsigned long *left)
|
|
|
|
{
|
|
|
|
unsigned long l = *left;
|
|
|
|
while (l && *cp != '\n') {
|
|
|
|
l--;
|
|
|
|
cp++;
|
|
|
|
}
|
|
|
|
*left = l;
|
|
|
|
return cp;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int word_char(char ch)
|
|
|
|
{
|
|
|
|
return isalnum(ch) || ch == '_';
|
|
|
|
}
|
|
|
|
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
static void output_color(struct grep_opt *opt, const void *data, size_t size,
|
|
|
|
const char *color)
|
|
|
|
{
|
color: delay auto-color decision until point of use
When we read a color value either from a config file or from
the command line, we use git_config_colorbool to convert it
from the tristate always/never/auto into a single yes/no
boolean value.
This has some timing implications with respect to starting
a pager.
If we start (or decide not to start) the pager before
checking the colorbool, everything is fine. Either isatty(1)
will give us the right information, or we will properly
check for pager_in_use().
However, if we decide to start a pager after we have checked
the colorbool, things are not so simple. If stdout is a tty,
then we will have already decided to use color. However, the
user may also have configured color.pager not to use color
with the pager. In this case, we need to actually turn off
color. Unfortunately, the pager code has no idea which color
variables were turned on (and there are many of them
throughout the code, and they may even have been manipulated
after the colorbool selection by something like "--color" on
the command line).
This bug can be seen any time a pager is started after
config and command line options are checked. This has
affected "git diff" since 89d07f7 (diff: don't run pager if
user asked for a diff style exit code, 2007-08-12). It has
also affect the log family since 1fda91b (Fix 'git log'
early pager startup error case, 2010-08-24).
This patch splits the notion of parsing a colorbool and
actually checking the configuration. The "use_color"
variables now have an additional possible value,
GIT_COLOR_AUTO. Users of the variable should use the new
"want_color()" wrapper, which will lazily determine and
cache the auto-color decision.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-18 07:04:23 +02:00
|
|
|
if (want_color(opt->color) && color && color[0]) {
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
opt->output(opt, color, strlen(color));
|
|
|
|
opt->output(opt, data, size);
|
|
|
|
opt->output(opt, GIT_COLOR_RESET, strlen(GIT_COLOR_RESET));
|
|
|
|
} else
|
|
|
|
opt->output(opt, data, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void output_sep(struct grep_opt *opt, char sign)
|
|
|
|
{
|
|
|
|
if (opt->null_following_name)
|
|
|
|
opt->output(opt, "\0", 1);
|
|
|
|
else
|
|
|
|
output_color(opt, &sign, 1, opt->color_sep);
|
|
|
|
}
|
|
|
|
|
2008-10-01 18:11:15 +02:00
|
|
|
static void show_name(struct grep_opt *opt, const char *name)
|
|
|
|
{
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
output_color(opt, name, strlen(name), opt->color_filename);
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, opt->null_following_name ? "\0" : "\n", 1);
|
2008-10-01 18:11:15 +02:00
|
|
|
}
|
|
|
|
|
2010-05-22 23:43:43 +02:00
|
|
|
static int fixmatch(struct grep_pat *p, char *line, char *eol,
|
|
|
|
regmatch_t *match)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
Use kwset in grep
Benchmarks for the hot cache case:
before:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
3,478,085 cache-misses # 2.322 M/sec ( +- 2.690% )
11,356,177 cache-references # 7.582 M/sec ( +- 2.598% )
3,872,184 branch-misses # 0.363 % ( +- 0.258% )
1,067,367,848 branches # 712.673 M/sec ( +- 2.622% )
3,828,370,782 instructions # 0.947 IPC ( +- 0.033% )
4,043,832,831 cycles # 2700.037 M/sec ( +- 0.167% )
8,518 page-faults # 0.006 M/sec ( +- 3.648% )
847 CPU-migrations # 0.001 M/sec ( +- 3.262% )
6,546 context-switches # 0.004 M/sec ( +- 2.292% )
1497.695495 task-clock-msecs # 3.303 CPUs ( +- 2.550% )
0.453394396 seconds time elapsed ( +- 0.912% )
after:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
2,989,918 cache-misses # 3.166 M/sec ( +- 5.013% )
10,986,041 cache-references # 11.633 M/sec ( +- 4.899% ) (scaled from 95.06%)
3,511,993 branch-misses # 1.422 % ( +- 0.785% )
246,893,561 branches # 261.433 M/sec ( +- 3.967% )
1,392,727,757 instructions # 0.564 IPC ( +- 0.040% )
2,468,142,397 cycles # 2613.494 M/sec ( +- 0.110% )
7,747 page-faults # 0.008 M/sec ( +- 3.995% )
897 CPU-migrations # 0.001 M/sec ( +- 2.383% )
6,535 context-switches # 0.007 M/sec ( +- 1.993% )
944.384228 task-clock-msecs # 3.177 CPUs ( +- 0.268% )
0.297257643 seconds time elapsed ( +- 0.450% )
So we gain about 35% by using the kwset code.
As a side effect of using kwset two grep tests are fixed by this
patch. The first is fixed because kwset can deal with case-insensitive
search containing NULs, something strcasestr cannot do. The second one
is fixed because we consider patterns containing NULs as fixed strings
(regcomp cannot accept patterns with NULs).
Signed-off-by: Fredrik Kuivinen <frekui@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-21 00:42:18 +02:00
|
|
|
struct kwsmatch kwsm;
|
|
|
|
size_t offset = kwsexec(p->kws, line, eol - line, &kwsm);
|
|
|
|
if (offset == -1) {
|
2006-09-18 01:02:52 +02:00
|
|
|
match->rm_so = match->rm_eo = -1;
|
|
|
|
return REG_NOMATCH;
|
Use kwset in grep
Benchmarks for the hot cache case:
before:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
3,478,085 cache-misses # 2.322 M/sec ( +- 2.690% )
11,356,177 cache-references # 7.582 M/sec ( +- 2.598% )
3,872,184 branch-misses # 0.363 % ( +- 0.258% )
1,067,367,848 branches # 712.673 M/sec ( +- 2.622% )
3,828,370,782 instructions # 0.947 IPC ( +- 0.033% )
4,043,832,831 cycles # 2700.037 M/sec ( +- 0.167% )
8,518 page-faults # 0.006 M/sec ( +- 3.648% )
847 CPU-migrations # 0.001 M/sec ( +- 3.262% )
6,546 context-switches # 0.004 M/sec ( +- 2.292% )
1497.695495 task-clock-msecs # 3.303 CPUs ( +- 2.550% )
0.453394396 seconds time elapsed ( +- 0.912% )
after:
$ perf stat --repeat=5 git grep qwerty > /dev/null
Performance counter stats for 'git grep qwerty' (5 runs):
2,989,918 cache-misses # 3.166 M/sec ( +- 5.013% )
10,986,041 cache-references # 11.633 M/sec ( +- 4.899% ) (scaled from 95.06%)
3,511,993 branch-misses # 1.422 % ( +- 0.785% )
246,893,561 branches # 261.433 M/sec ( +- 3.967% )
1,392,727,757 instructions # 0.564 IPC ( +- 0.040% )
2,468,142,397 cycles # 2613.494 M/sec ( +- 0.110% )
7,747 page-faults # 0.008 M/sec ( +- 3.995% )
897 CPU-migrations # 0.001 M/sec ( +- 2.383% )
6,535 context-switches # 0.007 M/sec ( +- 1.993% )
944.384228 task-clock-msecs # 3.177 CPUs ( +- 0.268% )
0.297257643 seconds time elapsed ( +- 0.450% )
So we gain about 35% by using the kwset code.
As a side effect of using kwset two grep tests are fixed by this
patch. The first is fixed because kwset can deal with case-insensitive
search containing NULs, something strcasestr cannot do. The second one
is fixed because we consider patterns containing NULs as fixed strings
(regcomp cannot accept patterns with NULs).
Signed-off-by: Fredrik Kuivinen <frekui@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-08-21 00:42:18 +02:00
|
|
|
} else {
|
|
|
|
match->rm_so = offset;
|
|
|
|
match->rm_eo = match->rm_so + kwsm.size[0];
|
2006-09-18 01:02:52 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-05-05 00:00:19 +02:00
|
|
|
static int patmatch(struct grep_pat *p, char *line, char *eol,
|
|
|
|
regmatch_t *match, int eflags)
|
|
|
|
{
|
|
|
|
int hit;
|
|
|
|
|
|
|
|
if (p->fixed)
|
|
|
|
hit = !fixmatch(p, line, eol, match);
|
2017-05-25 21:45:29 +02:00
|
|
|
else if (p->pcre1_regexp)
|
|
|
|
hit = !pcre1match(p, line, eol, match, eflags);
|
grep: add support for PCRE v2
Add support for v2 of the PCRE API. This is a new major version of
PCRE that came out in early 2015[1].
The regular expression syntax is the same, but while the API is
similar, pretty much every function is either renamed or takes
different arguments. Thus using it via entirely new functions makes
sense, as opposed to trying to e.g. have one compile_pcre_pattern()
that would call either PCRE v1 or v2 functions.
Git can now be compiled with either USE_LIBPCRE1=YesPlease or
USE_LIBPCRE2=YesPlease, with USE_LIBPCRE=YesPlease currently being a
synonym for the former. Providing both is a compile-time error.
With earlier patches to enable JIT for PCRE v1 the performance of the
release versions of both libraries is almost exactly the same, with
PCRE v2 being around 1% slower.
However after I reported this to the pcre-dev mailing list[2] I got a
lot of help with the API use from Zoltán Herczeg, he subsequently
optimized some of the JIT functionality in v2 of the library.
Running the p7820-grep-engines.sh performance test against the latest
Subversion trunk of both, with both them and git compiled as -O3, and
the test run against linux.git, gives the following results. Just the
/perl/ tests shown:
$ GIT_PERF_REPEAT_COUNT=30 GIT_PERF_LARGE_REPO=~/g/linux GIT_PERF_MAKE_COMMAND='grep -q LIBPCRE2 Makefile && make -j8 USE_LIBPCRE2=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre2/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre2/inst/lib || make -j8 USE_LIBPCRE=YesPlease CC=~/perl5/installed/bin/gcc NO_R_TO_GCC_LINKER=YesPlease CFLAGS=-O3 LIBPCREDIR=/home/avar/g/pcre/inst LDFLAGS=-Wl,-rpath,/home/avar/g/pcre/inst/lib' ./run HEAD~5 HEAD~ HEAD p7820-grep-engines.sh
[...]
Test HEAD~5 HEAD~ HEAD
-----------------------------------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.31(1.10+0.48) 0.21(0.35+0.56) -32.3% 0.21(0.34+0.55) -32.3%
7820.7: perl grep '^how to' 0.56(2.70+0.40) 0.24(0.64+0.52) -57.1% 0.20(0.28+0.60) -64.3%
7820.11: perl grep '[how] to' 0.56(2.66+0.38) 0.29(0.95+0.45) -48.2% 0.23(0.45+0.54) -58.9%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 1.02(5.77+0.42) 0.31(1.02+0.54) -69.6% 0.23(0.50+0.54) -77.5%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.38(1.57+0.42) 0.27(0.85+0.46) -28.9% 0.21(0.33+0.57) -44.7%
See commit ("perf: add a comparison test of grep regex engines",
2017-04-19) for details on the machine the above test run was executed
on.
Here HEAD~2 is git with PCRE v1 without JIT, HEAD~ is PCRE v1 with
JIT, and HEAD is PCRE v2 (also with JIT). See previous commits of mine
mentioning p7820-grep-engines.sh for more details on the test setup.
For ease of readability, a different run just of HEAD~ (PCRE v1 with
JIT v.s. PCRE v2), again with just the /perl/ tests shown:
[...]
Test HEAD~ HEAD
----------------------------------------------------------------------------------------
7820.3: perl grep 'how.to' 0.21(0.42+0.52) 0.21(0.31+0.58) +0.0%
7820.7: perl grep '^how to' 0.25(0.65+0.50) 0.20(0.31+0.57) -20.0%
7820.11: perl grep '[how] to' 0.30(0.90+0.50) 0.23(0.46+0.53) -23.3%
7820.15: perl grep '(e.t[^ ]*|v.ry) rare' 0.30(1.19+0.38) 0.23(0.51+0.51) -23.3%
7820.19: perl grep 'm(ú|u)lt.b(æ|y)te' 0.27(0.84+0.48) 0.21(0.34+0.57) -22.2%
I.e. the two are either neck-to-neck, but PCRE v2 usually pulls ahead,
when it does it's around 20% faster.
A brief note on thread safety: As noted in pcre2api(3) & pcre2jit(3)
the compiled pattern can be shared between threads, but not some of
the JIT context, however the grep threading support does all pattern &
JIT compilation in separate threads, so this code doesn't need to
concern itself with thread safety.
See commit 63e7e9d8b6 ("git-grep: Learn PCRE", 2011-05-09) for the
initial addition of PCRE v1. This change follows some of the same
patterns it did (and which were discussed on list at the time),
e.g. mocking up types with typedef instead of ifdef-ing them out when
USE_LIBPCRE2 isn't defined. This adds some trivial memory use to the
program, but makes the code look nicer.
1. https://lists.exim.org/lurker/message/20150105.162835.0666407a.en.html
2. https://lists.exim.org/lurker/thread/20170419.172322.833ee099.en.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-06-01 20:20:56 +02:00
|
|
|
else if (p->pcre2_pattern)
|
|
|
|
hit = !pcre2match(p, line, eol, match, eflags);
|
2011-05-05 00:00:19 +02:00
|
|
|
else
|
2016-09-21 20:24:14 +02:00
|
|
|
hit = !regexec_buf(&p->regexp, line, eol - line, 1, match,
|
|
|
|
eflags);
|
2011-05-05 00:00:19 +02:00
|
|
|
|
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
static int strip_timestamp(char *bol, char **eol_p)
|
|
|
|
{
|
|
|
|
char *eol = *eol_p;
|
|
|
|
int ch;
|
|
|
|
|
|
|
|
while (bol < --eol) {
|
|
|
|
if (*eol != '>')
|
|
|
|
continue;
|
|
|
|
*eol_p = ++eol;
|
|
|
|
ch = *eol;
|
|
|
|
*eol = '\0';
|
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct {
|
|
|
|
const char *field;
|
|
|
|
size_t len;
|
|
|
|
} header_field[] = {
|
|
|
|
{ "author ", 7 },
|
|
|
|
{ "committer ", 10 },
|
2012-09-29 06:41:28 +02:00
|
|
|
{ "reflog ", 7 },
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
};
|
|
|
|
|
2009-03-07 13:28:40 +01:00
|
|
|
static int match_one_pattern(struct grep_pat *p, char *bol, char *eol,
|
2009-03-07 13:30:27 +01:00
|
|
|
enum grep_context ctx,
|
|
|
|
regmatch_t *pmatch, int eflags)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
int hit = 0;
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
int saved_ch = 0;
|
2009-05-20 23:31:53 +02:00
|
|
|
const char *start = bol;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2006-09-20 21:39:46 +02:00
|
|
|
if ((p->token != GREP_PATTERN) &&
|
|
|
|
((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
|
|
|
|
return 0;
|
|
|
|
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
if (p->token == GREP_PATTERN_HEAD) {
|
|
|
|
const char *field;
|
|
|
|
size_t len;
|
|
|
|
assert(p->field < ARRAY_SIZE(header_field));
|
|
|
|
field = header_field[p->field].field;
|
|
|
|
len = header_field[p->field].len;
|
|
|
|
if (strncmp(bol, field, len))
|
|
|
|
return 0;
|
|
|
|
bol += len;
|
2012-09-29 06:41:27 +02:00
|
|
|
switch (p->field) {
|
|
|
|
case GREP_HEADER_AUTHOR:
|
|
|
|
case GREP_HEADER_COMMITTER:
|
|
|
|
saved_ch = strip_timestamp(bol, &eol);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
}
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
again:
|
2011-05-05 00:00:19 +02:00
|
|
|
hit = patmatch(p, bol, eol, pmatch, eflags);
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2009-03-07 13:28:40 +01:00
|
|
|
if (hit && p->word_regexp) {
|
2006-09-18 01:02:52 +02:00
|
|
|
if ((pmatch[0].rm_so < 0) ||
|
2009-06-03 18:19:01 +02:00
|
|
|
(eol - bol) < pmatch[0].rm_so ||
|
2006-09-18 01:02:52 +02:00
|
|
|
(pmatch[0].rm_eo < 0) ||
|
|
|
|
(eol - bol) < pmatch[0].rm_eo)
|
|
|
|
die("regexp returned nonsense");
|
|
|
|
|
|
|
|
/* Match beginning must be either beginning of the
|
|
|
|
* line, or at word boundary (i.e. the last char must
|
|
|
|
* not be a word char). Similarly, match end must be
|
|
|
|
* either end of the line, or at word boundary
|
|
|
|
* (i.e. the next char must not be a word char).
|
|
|
|
*/
|
2009-01-10 00:08:40 +01:00
|
|
|
if ( ((pmatch[0].rm_so == 0) ||
|
2006-09-18 01:02:52 +02:00
|
|
|
!word_char(bol[pmatch[0].rm_so-1])) &&
|
|
|
|
((pmatch[0].rm_eo == (eol-bol)) ||
|
|
|
|
!word_char(bol[pmatch[0].rm_eo])) )
|
|
|
|
;
|
|
|
|
else
|
|
|
|
hit = 0;
|
|
|
|
|
2009-06-03 18:19:01 +02:00
|
|
|
/* Words consist of at least one character. */
|
|
|
|
if (pmatch->rm_so == pmatch->rm_eo)
|
|
|
|
hit = 0;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
if (!hit && pmatch[0].rm_so + bol + 1 < eol) {
|
|
|
|
/* There could be more than one match on the
|
|
|
|
* line, and the first match might not be
|
|
|
|
* strict word match. But later ones could be!
|
2009-01-10 00:08:40 +01:00
|
|
|
* Forward to the next possible start, i.e. the
|
|
|
|
* next position following a non-word char.
|
2006-09-18 01:02:52 +02:00
|
|
|
*/
|
|
|
|
bol = pmatch[0].rm_so + bol + 1;
|
2009-01-10 00:08:40 +01:00
|
|
|
while (word_char(bol[-1]) && bol < eol)
|
|
|
|
bol++;
|
2009-05-23 13:45:26 +02:00
|
|
|
eflags |= REG_NOTBOL;
|
2009-01-10 00:08:40 +01:00
|
|
|
if (bol < eol)
|
|
|
|
goto again;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
}
|
log --author/--committer: really match only with name part
When we tried to find commits done by AUTHOR, the first implementation
tried to pattern match a line with "^author .*AUTHOR", which later was
enhanced to strip leading caret and look for "^author AUTHOR" when the
search pattern was anchored at the left end (i.e. --author="^AUTHOR").
This had a few problems:
* When looking for fixed strings (e.g. "git log -F --author=x --grep=y"),
the regexp internally used "^author .*x" would never match anything;
* To match at the end (e.g. "git log --author='google.com>$'"), the
generated regexp has to also match the trailing timestamp part the
commit header lines have. Also, in order to determine if the '$' at
the end means "match at the end of the line" or just a literal dollar
sign (probably backslash-quoted), we would need to parse the regexp
ourselves.
An earlier alternative tried to make sure that a line matches "^author "
(to limit by field name) and the user supplied pattern at the same time.
While it solved the -F problem by introducing a special override for
matching the "^author ", it did not solve the trailing timestamp nor tail
match problem. It also would have matched every commit if --author=author
was asked for, not because the author's email part had this string, but
because every commit header line that talks about the author begins with
that field name, regardleses of who wrote it.
Instead of piling more hacks on top of hacks, this rethinks the grep
machinery that is used to look for strings in the commit header, and makes
sure that (1) field name matches literally at the beginning of the line,
followed by a SP, and (2) the user supplied pattern is matched against the
remainder of the line, excluding the trailing timestamp data.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-09-05 07:15:02 +02:00
|
|
|
if (p->token == GREP_PATTERN_HEAD && saved_ch)
|
|
|
|
*eol = saved_ch;
|
2009-05-20 23:31:53 +02:00
|
|
|
if (hit) {
|
|
|
|
pmatch[0].rm_so += bol - start;
|
|
|
|
pmatch[0].rm_eo += bol - start;
|
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2009-03-07 13:28:40 +01:00
|
|
|
static int match_expr_eval(struct grep_expr *x, char *bol, char *eol,
|
|
|
|
enum grep_context ctx, int collect_hits)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
2006-09-28 02:50:52 +02:00
|
|
|
int h = 0;
|
2009-03-07 13:30:27 +01:00
|
|
|
regmatch_t match;
|
2006-09-28 02:50:52 +02:00
|
|
|
|
2009-04-27 20:10:24 +02:00
|
|
|
if (!x)
|
|
|
|
die("Not a valid grep expression");
|
2006-09-18 01:02:52 +02:00
|
|
|
switch (x->node) {
|
2010-09-13 07:15:35 +02:00
|
|
|
case GREP_NODE_TRUE:
|
|
|
|
h = 1;
|
|
|
|
break;
|
2006-09-18 01:02:52 +02:00
|
|
|
case GREP_NODE_ATOM:
|
2009-03-07 13:30:27 +01:00
|
|
|
h = match_one_pattern(x->u.atom, bol, eol, ctx, &match, 0);
|
2006-09-18 01:02:52 +02:00
|
|
|
break;
|
|
|
|
case GREP_NODE_NOT:
|
2009-03-07 13:28:40 +01:00
|
|
|
h = !match_expr_eval(x->u.unary, bol, eol, ctx, 0);
|
2006-09-28 02:50:52 +02:00
|
|
|
break;
|
2006-09-18 01:02:52 +02:00
|
|
|
case GREP_NODE_AND:
|
2009-03-07 13:28:40 +01:00
|
|
|
if (!match_expr_eval(x->u.binary.left, bol, eol, ctx, 0))
|
2009-03-07 13:27:15 +01:00
|
|
|
return 0;
|
2009-03-07 13:28:40 +01:00
|
|
|
h = match_expr_eval(x->u.binary.right, bol, eol, ctx, 0);
|
2006-09-28 02:50:52 +02:00
|
|
|
break;
|
2006-09-18 01:02:52 +02:00
|
|
|
case GREP_NODE_OR:
|
2006-09-28 02:50:52 +02:00
|
|
|
if (!collect_hits)
|
2009-03-07 13:28:40 +01:00
|
|
|
return (match_expr_eval(x->u.binary.left,
|
2006-09-28 02:50:52 +02:00
|
|
|
bol, eol, ctx, 0) ||
|
2009-03-07 13:28:40 +01:00
|
|
|
match_expr_eval(x->u.binary.right,
|
2006-09-28 02:50:52 +02:00
|
|
|
bol, eol, ctx, 0));
|
2009-03-07 13:28:40 +01:00
|
|
|
h = match_expr_eval(x->u.binary.left, bol, eol, ctx, 0);
|
2006-09-28 02:50:52 +02:00
|
|
|
x->u.binary.left->hit |= h;
|
2009-03-07 13:28:40 +01:00
|
|
|
h |= match_expr_eval(x->u.binary.right, bol, eol, ctx, 1);
|
2006-09-28 02:50:52 +02:00
|
|
|
break;
|
|
|
|
default:
|
2009-01-04 19:38:41 +01:00
|
|
|
die("Unexpected node type (internal error) %d", x->node);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
2006-09-28 02:50:52 +02:00
|
|
|
if (collect_hits)
|
|
|
|
x->hit |= h;
|
|
|
|
return h;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2006-09-20 21:39:46 +02:00
|
|
|
static int match_expr(struct grep_opt *opt, char *bol, char *eol,
|
2006-09-28 02:50:52 +02:00
|
|
|
enum grep_context ctx, int collect_hits)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
struct grep_expr *x = opt->pattern_expression;
|
2009-03-07 13:28:40 +01:00
|
|
|
return match_expr_eval(x, bol, eol, ctx, collect_hits);
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2006-09-20 21:39:46 +02:00
|
|
|
static int match_line(struct grep_opt *opt, char *bol, char *eol,
|
2006-09-28 02:50:52 +02:00
|
|
|
enum grep_context ctx, int collect_hits)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
2009-03-07 13:30:27 +01:00
|
|
|
regmatch_t match;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
if (opt->extended)
|
2006-09-28 02:50:52 +02:00
|
|
|
return match_expr(opt, bol, eol, ctx, collect_hits);
|
|
|
|
|
|
|
|
/* we do not call with collect_hits without being extended */
|
2006-09-18 01:02:52 +02:00
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
2009-03-07 13:30:27 +01:00
|
|
|
if (match_one_pattern(p, bol, eol, ctx, &match, 0))
|
2006-09-18 01:02:52 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-03-07 13:32:32 +01:00
|
|
|
static int match_next_pattern(struct grep_pat *p, char *bol, char *eol,
|
|
|
|
enum grep_context ctx,
|
|
|
|
regmatch_t *pmatch, int eflags)
|
|
|
|
{
|
|
|
|
regmatch_t match;
|
|
|
|
|
|
|
|
if (!match_one_pattern(p, bol, eol, ctx, &match, eflags))
|
|
|
|
return 0;
|
|
|
|
if (match.rm_so < 0 || match.rm_eo < 0)
|
|
|
|
return 0;
|
|
|
|
if (pmatch->rm_so >= 0 && pmatch->rm_eo >= 0) {
|
|
|
|
if (match.rm_so > pmatch->rm_so)
|
|
|
|
return 1;
|
|
|
|
if (match.rm_so == pmatch->rm_so && match.rm_eo < pmatch->rm_eo)
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
pmatch->rm_so = match.rm_so;
|
|
|
|
pmatch->rm_eo = match.rm_eo;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int next_match(struct grep_opt *opt, char *bol, char *eol,
|
|
|
|
enum grep_context ctx, regmatch_t *pmatch, int eflags)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
int hit = 0;
|
|
|
|
|
|
|
|
pmatch->rm_so = pmatch->rm_eo = -1;
|
|
|
|
if (bol < eol) {
|
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
|
|
|
switch (p->token) {
|
|
|
|
case GREP_PATTERN: /* atom */
|
|
|
|
case GREP_PATTERN_HEAD:
|
|
|
|
case GREP_PATTERN_BODY:
|
|
|
|
hit |= match_next_pattern(p, bol, eol, ctx,
|
|
|
|
pmatch, eflags);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void show_line(struct grep_opt *opt, char *bol, char *eol,
|
|
|
|
const char *name, unsigned lno, char sign)
|
|
|
|
{
|
|
|
|
int rest = eol - bol;
|
2014-10-27 19:23:05 +01:00
|
|
|
const char *match_color, *line_color = NULL;
|
2009-03-07 13:32:32 +01:00
|
|
|
|
2011-06-05 17:24:25 +02:00
|
|
|
if (opt->file_break && opt->last_shown == 0) {
|
|
|
|
if (opt->show_hunk_mark)
|
|
|
|
opt->output(opt, "\n", 1);
|
2011-08-01 19:20:53 +02:00
|
|
|
} else if (opt->pre_context || opt->post_context || opt->funcbody) {
|
2009-07-02 00:03:44 +02:00
|
|
|
if (opt->last_shown == 0) {
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
if (opt->show_hunk_mark) {
|
|
|
|
output_color(opt, "--", 2, opt->color_sep);
|
|
|
|
opt->output(opt, "\n", 1);
|
2010-04-03 21:28:39 +02:00
|
|
|
}
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
} else if (lno > opt->last_shown + 1) {
|
|
|
|
output_color(opt, "--", 2, opt->color_sep);
|
|
|
|
opt->output(opt, "\n", 1);
|
|
|
|
}
|
2009-07-02 00:02:38 +02:00
|
|
|
}
|
2011-06-05 17:24:36 +02:00
|
|
|
if (opt->heading && opt->last_shown == 0) {
|
|
|
|
output_color(opt, name, strlen(name), opt->color_filename);
|
|
|
|
opt->output(opt, "\n", 1);
|
|
|
|
}
|
2009-07-02 00:02:38 +02:00
|
|
|
opt->last_shown = lno;
|
|
|
|
|
2011-06-05 17:24:36 +02:00
|
|
|
if (!opt->heading && opt->pathname) {
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
output_color(opt, name, strlen(name), opt->color_filename);
|
|
|
|
output_sep(opt, sign);
|
2010-01-25 23:51:39 +01:00
|
|
|
}
|
|
|
|
if (opt->linenum) {
|
|
|
|
char buf[32];
|
2017-03-28 21:46:56 +02:00
|
|
|
xsnprintf(buf, sizeof(buf), "%d", lno);
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
output_color(opt, buf, strlen(buf), opt->color_lineno);
|
|
|
|
output_sep(opt, sign);
|
2010-01-25 23:51:39 +01:00
|
|
|
}
|
2009-03-07 13:32:32 +01:00
|
|
|
if (opt->color) {
|
|
|
|
regmatch_t match;
|
|
|
|
enum grep_context ctx = GREP_CONTEXT_BODY;
|
|
|
|
int ch = *eol;
|
|
|
|
int eflags = 0;
|
|
|
|
|
2014-10-27 19:23:05 +01:00
|
|
|
if (sign == ':')
|
|
|
|
match_color = opt->color_match_selected;
|
|
|
|
else
|
|
|
|
match_color = opt->color_match_context;
|
2010-03-07 17:52:47 +01:00
|
|
|
if (sign == ':')
|
|
|
|
line_color = opt->color_selected;
|
|
|
|
else if (sign == '-')
|
|
|
|
line_color = opt->color_context;
|
|
|
|
else if (sign == '=')
|
|
|
|
line_color = opt->color_function;
|
2009-03-07 13:32:32 +01:00
|
|
|
*eol = '\0';
|
|
|
|
while (next_match(opt, bol, eol, ctx, &match, eflags)) {
|
2009-06-01 23:53:05 +02:00
|
|
|
if (match.rm_so == match.rm_eo)
|
|
|
|
break;
|
2010-01-25 23:51:39 +01:00
|
|
|
|
2010-03-07 17:52:47 +01:00
|
|
|
output_color(opt, bol, match.rm_so, line_color);
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
output_color(opt, bol + match.rm_so,
|
2014-10-27 19:23:05 +01:00
|
|
|
match.rm_eo - match.rm_so, match_color);
|
2009-03-07 13:32:32 +01:00
|
|
|
bol += match.rm_eo;
|
|
|
|
rest -= match.rm_eo;
|
|
|
|
eflags = REG_NOTBOL;
|
|
|
|
}
|
|
|
|
*eol = ch;
|
|
|
|
}
|
2010-03-07 17:52:47 +01:00
|
|
|
output_color(opt, bol, rest, line_color);
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, "\n", 1);
|
2009-03-07 13:32:32 +01:00
|
|
|
}
|
|
|
|
|
2011-12-12 22:16:07 +01:00
|
|
|
#ifndef NO_PTHREADS
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
int grep_use_locks;
|
|
|
|
|
2011-12-12 22:16:07 +01:00
|
|
|
/*
|
|
|
|
* This lock protects access to the gitattributes machinery, which is
|
|
|
|
* not thread-safe.
|
|
|
|
*/
|
|
|
|
pthread_mutex_t grep_attr_mutex;
|
|
|
|
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
static inline void grep_attr_lock(void)
|
2011-12-12 22:16:07 +01:00
|
|
|
{
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
if (grep_use_locks)
|
2011-12-12 22:16:07 +01:00
|
|
|
pthread_mutex_lock(&grep_attr_mutex);
|
|
|
|
}
|
|
|
|
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
static inline void grep_attr_unlock(void)
|
2011-12-12 22:16:07 +01:00
|
|
|
{
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
if (grep_use_locks)
|
2011-12-12 22:16:07 +01:00
|
|
|
pthread_mutex_unlock(&grep_attr_mutex);
|
|
|
|
}
|
2012-02-02 09:18:41 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Same as git_attr_mutex, but protecting the thread-unsafe object db access.
|
|
|
|
*/
|
|
|
|
pthread_mutex_t grep_read_mutex;
|
|
|
|
|
2011-12-12 22:16:07 +01:00
|
|
|
#else
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:18:29 +01:00
|
|
|
#define grep_attr_lock()
|
|
|
|
#define grep_attr_unlock()
|
2011-12-12 22:16:07 +01:00
|
|
|
#endif
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
static int match_funcname(struct grep_opt *opt, struct grep_source *gs, char *bol, char *eol)
|
2009-07-02 00:06:34 +02:00
|
|
|
{
|
2009-07-02 00:07:24 +02:00
|
|
|
xdemitconf_t *xecfg = opt->priv;
|
2011-12-12 22:16:07 +01:00
|
|
|
if (xecfg && !xecfg->find_func) {
|
2012-02-02 09:20:43 +01:00
|
|
|
grep_source_load_driver(gs);
|
|
|
|
if (gs->driver->funcname.pattern) {
|
|
|
|
const struct userdiff_funcname *pe = &gs->driver->funcname;
|
2011-12-12 22:16:07 +01:00
|
|
|
xdiff_set_find_func(xecfg, pe->pattern, pe->cflags);
|
|
|
|
} else {
|
|
|
|
xecfg = opt->priv = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (xecfg) {
|
2009-07-02 00:07:24 +02:00
|
|
|
char buf[1];
|
|
|
|
return xecfg->find_func(bol, eol - bol, buf, 1,
|
|
|
|
xecfg->find_func_priv) >= 0;
|
|
|
|
}
|
|
|
|
|
2009-07-02 00:06:34 +02:00
|
|
|
if (bol == eol)
|
|
|
|
return 0;
|
|
|
|
if (isalpha(*bol) || *bol == '_' || *bol == '$')
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
static void show_funcname_line(struct grep_opt *opt, struct grep_source *gs,
|
|
|
|
char *bol, unsigned lno)
|
2009-07-02 00:06:34 +02:00
|
|
|
{
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
while (bol > gs->buf) {
|
2009-07-02 00:06:34 +02:00
|
|
|
char *eol = --bol;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
while (bol > gs->buf && bol[-1] != '\n')
|
2009-07-02 00:06:34 +02:00
|
|
|
bol--;
|
|
|
|
lno--;
|
|
|
|
|
|
|
|
if (lno <= opt->last_shown)
|
|
|
|
break;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
if (match_funcname(opt, gs, bol, eol)) {
|
|
|
|
show_line(opt, bol, eol, gs->name, lno, '=');
|
2009-07-02 00:06:34 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
static void show_pre_context(struct grep_opt *opt, struct grep_source *gs,
|
2011-08-01 19:20:53 +02:00
|
|
|
char *bol, char *end, unsigned lno)
|
2009-07-02 00:05:17 +02:00
|
|
|
{
|
2009-07-02 00:06:34 +02:00
|
|
|
unsigned cur = lno, from = 1, funcname_lno = 0;
|
2011-08-01 19:20:53 +02:00
|
|
|
int funcname_needed = !!opt->funcname;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
if (opt->funcbody && !match_funcname(opt, gs, bol, end))
|
2011-08-01 19:20:53 +02:00
|
|
|
funcname_needed = 2;
|
2009-07-02 00:05:17 +02:00
|
|
|
|
|
|
|
if (opt->pre_context < lno)
|
|
|
|
from = lno - opt->pre_context;
|
|
|
|
if (from <= opt->last_shown)
|
|
|
|
from = opt->last_shown + 1;
|
|
|
|
|
|
|
|
/* Rewind. */
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
while (bol > gs->buf &&
|
2011-08-01 19:20:53 +02:00
|
|
|
cur > (funcname_needed == 2 ? opt->last_shown + 1 : from)) {
|
2009-07-02 00:06:34 +02:00
|
|
|
char *eol = --bol;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
while (bol > gs->buf && bol[-1] != '\n')
|
2009-07-02 00:05:17 +02:00
|
|
|
bol--;
|
|
|
|
cur--;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
if (funcname_needed && match_funcname(opt, gs, bol, eol)) {
|
2009-07-02 00:06:34 +02:00
|
|
|
funcname_lno = cur;
|
|
|
|
funcname_needed = 0;
|
|
|
|
}
|
2009-07-02 00:05:17 +02:00
|
|
|
}
|
|
|
|
|
2009-07-02 00:06:34 +02:00
|
|
|
/* We need to look even further back to find a function signature. */
|
|
|
|
if (opt->funcname && funcname_needed)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_funcname_line(opt, gs, bol, cur);
|
2009-07-02 00:06:34 +02:00
|
|
|
|
2009-07-02 00:05:17 +02:00
|
|
|
/* Back forward. */
|
|
|
|
while (cur < lno) {
|
2009-07-02 00:06:34 +02:00
|
|
|
char *eol = bol, sign = (cur == funcname_lno) ? '=' : '-';
|
2009-07-02 00:05:17 +02:00
|
|
|
|
|
|
|
while (*eol != '\n')
|
|
|
|
eol++;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_line(opt, bol, eol, gs->name, cur, sign);
|
2009-07-02 00:05:17 +02:00
|
|
|
bol = eol + 1;
|
|
|
|
cur++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-01-11 07:39:36 +01:00
|
|
|
static int should_lookahead(struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
struct grep_pat *p;
|
|
|
|
|
|
|
|
if (opt->extended)
|
|
|
|
return 0; /* punt for too complex stuff */
|
|
|
|
if (opt->invert)
|
|
|
|
return 0;
|
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
|
|
|
if (p->token != GREP_PATTERN)
|
|
|
|
return 0; /* punt for "header only" and stuff */
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int look_ahead(struct grep_opt *opt,
|
|
|
|
unsigned long *left_p,
|
|
|
|
unsigned *lno_p,
|
|
|
|
char **bol_p)
|
|
|
|
{
|
|
|
|
unsigned lno = *lno_p;
|
|
|
|
char *bol = *bol_p;
|
|
|
|
struct grep_pat *p;
|
|
|
|
char *sp, *last_bol;
|
|
|
|
regoff_t earliest = -1;
|
|
|
|
|
|
|
|
for (p = opt->pattern_list; p; p = p->next) {
|
|
|
|
int hit;
|
|
|
|
regmatch_t m;
|
|
|
|
|
2011-05-05 00:00:19 +02:00
|
|
|
hit = patmatch(p, bol, bol + *left_p, &m, 0);
|
2010-01-11 07:39:36 +01:00
|
|
|
if (!hit || m.rm_so < 0 || m.rm_eo < 0)
|
|
|
|
continue;
|
|
|
|
if (earliest < 0 || m.rm_so < earliest)
|
|
|
|
earliest = m.rm_so;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (earliest < 0) {
|
|
|
|
*bol_p = bol + *left_p;
|
|
|
|
*left_p = 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--)
|
|
|
|
; /* find the beginning of the line */
|
|
|
|
last_bol = sp;
|
|
|
|
|
|
|
|
for (sp = bol; sp < last_bol; sp++) {
|
|
|
|
if (*sp == '\n')
|
|
|
|
lno++;
|
|
|
|
}
|
|
|
|
*left_p -= last_bol - bol;
|
|
|
|
*bol_p = last_bol;
|
|
|
|
*lno_p = lno;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-05-10 17:10:15 +02:00
|
|
|
static int fill_textconv_grep(struct userdiff_driver *driver,
|
|
|
|
struct grep_source *gs)
|
|
|
|
{
|
|
|
|
struct diff_filespec *df;
|
|
|
|
char *buf;
|
|
|
|
size_t size;
|
|
|
|
|
|
|
|
if (!driver || !driver->textconv)
|
|
|
|
return grep_source_load(gs);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The textconv interface is intimately tied to diff_filespecs, so we
|
|
|
|
* have to pretend to be one. If we could unify the grep_source
|
|
|
|
* and diff_filespec structs, this mess could just go away.
|
|
|
|
*/
|
|
|
|
df = alloc_filespec(gs->path);
|
|
|
|
switch (gs->type) {
|
2017-05-30 19:30:44 +02:00
|
|
|
case GREP_SOURCE_OID:
|
2013-05-10 17:10:15 +02:00
|
|
|
fill_filespec(df, gs->identifier, 1, 0100644);
|
|
|
|
break;
|
|
|
|
case GREP_SOURCE_FILE:
|
2017-05-30 19:30:50 +02:00
|
|
|
fill_filespec(df, &null_oid, 0, 0100644);
|
2013-05-10 17:10:15 +02:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
die("BUG: attempt to textconv something without a path?");
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* fill_textconv is not remotely thread-safe; it may load objects
|
|
|
|
* behind the scenes, and it modifies the global diff tempfile
|
|
|
|
* structure.
|
|
|
|
*/
|
|
|
|
grep_read_lock();
|
|
|
|
size = fill_textconv(driver, df, &buf);
|
|
|
|
grep_read_unlock();
|
|
|
|
free_filespec(df);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The normal fill_textconv usage by the diff machinery would just keep
|
|
|
|
* the textconv'd buf separate from the diff_filespec. But much of the
|
|
|
|
* grep code passes around a grep_source and assumes that its "buf"
|
|
|
|
* pointer is the beginning of the thing we are searching. So let's
|
|
|
|
* install our textconv'd version into the grep_source, taking care not
|
|
|
|
* to leak any existing buffer.
|
|
|
|
*/
|
|
|
|
grep_source_clear_data(gs);
|
|
|
|
gs->buf = buf;
|
|
|
|
gs->size = size;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-05-28 17:06:19 +02:00
|
|
|
static int is_empty_line(const char *bol, const char *eol)
|
|
|
|
{
|
|
|
|
while (bol < eol && isspace(*bol))
|
|
|
|
bol++;
|
|
|
|
return bol == eol;
|
|
|
|
}
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int collect_hits)
|
2006-09-18 01:02:52 +02:00
|
|
|
{
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
char *bol;
|
2016-05-28 17:06:19 +02:00
|
|
|
char *peek_bol = NULL;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
unsigned long left;
|
2006-09-18 01:02:52 +02:00
|
|
|
unsigned lno = 1;
|
|
|
|
unsigned last_hit = 0;
|
|
|
|
int binary_match_only = 0;
|
|
|
|
unsigned count = 0;
|
2010-01-11 07:39:36 +01:00
|
|
|
int try_lookahead = 0;
|
2011-08-01 19:20:53 +02:00
|
|
|
int show_function = 0;
|
2013-05-10 17:10:15 +02:00
|
|
|
struct userdiff_driver *textconv = NULL;
|
2006-09-20 21:39:46 +02:00
|
|
|
enum grep_context ctx = GREP_CONTEXT_HEAD;
|
2009-07-02 00:07:24 +02:00
|
|
|
xdemitconf_t xecfg;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2010-01-25 23:51:39 +01:00
|
|
|
if (!opt->output)
|
|
|
|
opt->output = std_output;
|
|
|
|
|
2011-08-01 19:20:53 +02:00
|
|
|
if (opt->pre_context || opt->post_context || opt->file_break ||
|
|
|
|
opt->funcbody) {
|
2011-06-05 17:24:15 +02:00
|
|
|
/* Show hunk marks, except for the first file. */
|
|
|
|
if (opt->last_shown)
|
|
|
|
opt->show_hunk_mark = 1;
|
|
|
|
/*
|
|
|
|
* If we're using threads then we can't easily identify
|
|
|
|
* the first file. Always put hunk marks in that case
|
|
|
|
* and skip the very first one later in work_done().
|
|
|
|
*/
|
|
|
|
if (opt->output != std_output)
|
|
|
|
opt->show_hunk_mark = 1;
|
|
|
|
}
|
2010-03-15 17:21:10 +01:00
|
|
|
opt->last_shown = 0;
|
|
|
|
|
2013-05-10 17:10:15 +02:00
|
|
|
if (opt->allow_textconv) {
|
|
|
|
grep_source_load_driver(gs);
|
|
|
|
/*
|
|
|
|
* We might set up the shared textconv cache data here, which
|
|
|
|
* is not thread-safe.
|
|
|
|
*/
|
|
|
|
grep_attr_lock();
|
|
|
|
textconv = userdiff_get_textconv(gs->driver);
|
|
|
|
grep_attr_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We know the result of a textconv is text, so we only have to care
|
|
|
|
* about binary handling if we are not using it.
|
|
|
|
*/
|
|
|
|
if (!textconv) {
|
|
|
|
switch (opt->binary) {
|
|
|
|
case GREP_BINARY_DEFAULT:
|
|
|
|
if (grep_source_is_binary(gs))
|
|
|
|
binary_match_only = 1;
|
|
|
|
break;
|
|
|
|
case GREP_BINARY_NOMATCH:
|
|
|
|
if (grep_source_is_binary(gs))
|
|
|
|
return 0; /* Assume unmatch */
|
|
|
|
break;
|
|
|
|
case GREP_BINARY_TEXT:
|
|
|
|
break;
|
|
|
|
default:
|
2016-07-26 18:05:50 +02:00
|
|
|
die("BUG: unknown binary handling mode");
|
2013-05-10 17:10:15 +02:00
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
2009-07-02 00:07:24 +02:00
|
|
|
memset(&xecfg, 0, sizeof(xecfg));
|
2011-12-12 22:16:07 +01:00
|
|
|
opt->priv = &xecfg;
|
|
|
|
|
2010-01-11 07:39:36 +01:00
|
|
|
try_lookahead = should_lookahead(opt);
|
2009-07-02 00:07:24 +02:00
|
|
|
|
2013-05-10 17:10:15 +02:00
|
|
|
if (fill_textconv_grep(textconv, gs) < 0)
|
2012-02-02 09:21:11 +01:00
|
|
|
return 0;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
bol = gs->buf;
|
|
|
|
left = gs->size;
|
2006-09-18 01:02:52 +02:00
|
|
|
while (left) {
|
|
|
|
char *eol, ch;
|
2006-09-28 02:50:52 +02:00
|
|
|
int hit;
|
2006-09-18 01:02:52 +02:00
|
|
|
|
2010-01-11 07:39:36 +01:00
|
|
|
/*
|
2011-05-09 23:52:03 +02:00
|
|
|
* look_ahead() skips quickly to the line that possibly
|
2010-01-11 07:39:36 +01:00
|
|
|
* has the next hit; don't call it if we need to do
|
|
|
|
* something more than just skipping the current line
|
|
|
|
* in response to an unmatch for the current line. E.g.
|
|
|
|
* inside a post-context window, we will show the current
|
|
|
|
* line as a context around the previous hit when it
|
|
|
|
* doesn't hit.
|
|
|
|
*/
|
|
|
|
if (try_lookahead
|
|
|
|
&& !(last_hit
|
2011-08-01 19:20:53 +02:00
|
|
|
&& (show_function ||
|
|
|
|
lno <= last_hit + opt->post_context))
|
2010-01-11 07:39:36 +01:00
|
|
|
&& look_ahead(opt, &left, &lno, &bol))
|
|
|
|
break;
|
2006-09-18 01:02:52 +02:00
|
|
|
eol = end_of_line(bol, &left);
|
|
|
|
ch = *eol;
|
|
|
|
*eol = 0;
|
|
|
|
|
2006-09-20 21:39:46 +02:00
|
|
|
if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol))
|
|
|
|
ctx = GREP_CONTEXT_BODY;
|
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
hit = match_line(opt, bol, eol, ctx, collect_hits);
|
2006-09-18 01:02:52 +02:00
|
|
|
*eol = ch;
|
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
if (collect_hits)
|
|
|
|
goto next_line;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
/* "grep -v -e foo -e bla" should list lines
|
|
|
|
* that do not have either, so inversion should
|
|
|
|
* be done outside.
|
|
|
|
*/
|
|
|
|
if (opt->invert)
|
|
|
|
hit = !hit;
|
|
|
|
if (opt->unmatch_name_only) {
|
|
|
|
if (hit)
|
|
|
|
return 0;
|
|
|
|
goto next_line;
|
|
|
|
}
|
|
|
|
if (hit) {
|
|
|
|
count++;
|
|
|
|
if (opt->status_only)
|
|
|
|
return 1;
|
2010-05-22 23:30:48 +02:00
|
|
|
if (opt->name_only) {
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_name(opt, gs->name);
|
2010-05-22 23:30:48 +02:00
|
|
|
return 1;
|
|
|
|
}
|
2010-05-22 23:29:35 +02:00
|
|
|
if (opt->count)
|
|
|
|
goto next_line;
|
2006-09-18 01:02:52 +02:00
|
|
|
if (binary_match_only) {
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, "Binary file ", 12);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
output_color(opt, gs->name, strlen(gs->name),
|
grep: Colorize filename, line number, and separator
Colorize the filename, line number, and separator in git grep output, as
GNU grep does. The colors are customizable through color.grep.<slot>.
The default is to only color the separator (in cyan), since this gives
the biggest legibility increase without overwhelming the user with
colors. GNU grep also defaults cyan for the separator, but defaults to
magenta for the filename and to green for the line number, as well.
There is one difference from GNU grep: When a binary file matches
without -a, GNU grep does not color the <file> in "Binary file <file>
matches", but we do.
Like GNU grep, if --null is given, the null separators are not colored.
For config.txt, use a a sub-list to describe the slots, rather than
a single paragraph with parentheses, since this is much more readable.
Remove the cast to int for `rm_eo - rm_so` since it is not necessary.
Signed-off-by: Mark Lodato <lodatom@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-03-07 17:52:46 +01:00
|
|
|
opt->color_filename);
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, " matches\n", 9);
|
2006-09-18 01:02:52 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
/* Hit at this line. If we haven't shown the
|
|
|
|
* pre-context lines, we would need to show them.
|
|
|
|
*/
|
2011-08-01 19:20:53 +02:00
|
|
|
if (opt->pre_context || opt->funcbody)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_pre_context(opt, gs, bol, eol, lno);
|
2009-07-02 00:06:34 +02:00
|
|
|
else if (opt->funcname)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_funcname_line(opt, gs, bol, lno);
|
|
|
|
show_line(opt, bol, eol, gs->name, lno, ':');
|
2009-07-02 00:02:38 +02:00
|
|
|
last_hit = lno;
|
2011-08-01 19:20:53 +02:00
|
|
|
if (opt->funcbody)
|
|
|
|
show_function = 1;
|
|
|
|
goto next_line;
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
2016-05-28 17:06:19 +02:00
|
|
|
if (show_function && (!peek_bol || peek_bol < bol)) {
|
|
|
|
unsigned long peek_left = left;
|
|
|
|
char *peek_eol = eol;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Trailing empty lines are not interesting.
|
|
|
|
* Peek past them to see if they belong to the
|
|
|
|
* body of the current function.
|
|
|
|
*/
|
|
|
|
peek_bol = bol;
|
|
|
|
while (is_empty_line(peek_bol, peek_eol)) {
|
|
|
|
peek_bol = peek_eol + 1;
|
|
|
|
peek_eol = end_of_line(peek_bol, &peek_left);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (match_funcname(opt, gs, peek_bol, peek_eol))
|
|
|
|
show_function = 0;
|
|
|
|
}
|
2011-08-01 19:20:53 +02:00
|
|
|
if (show_function ||
|
|
|
|
(last_hit && lno <= last_hit + opt->post_context)) {
|
2006-09-18 01:02:52 +02:00
|
|
|
/* If the last hit is within the post context,
|
|
|
|
* we need to show this line.
|
|
|
|
*/
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_line(opt, bol, eol, gs->name, lno, '-');
|
2006-09-18 01:02:52 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
next_line:
|
|
|
|
bol = eol + 1;
|
|
|
|
if (!left)
|
|
|
|
break;
|
|
|
|
left--;
|
|
|
|
lno++;
|
|
|
|
}
|
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
if (collect_hits)
|
|
|
|
return 0;
|
2006-09-28 01:27:10 +02:00
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
if (opt->status_only)
|
|
|
|
return 0;
|
|
|
|
if (opt->unmatch_name_only) {
|
|
|
|
/* We did not see any hit, so we want to show this */
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
show_name(opt, gs->name);
|
2006-09-18 01:02:52 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-07-02 00:07:24 +02:00
|
|
|
xdiff_clear_find_func(&xecfg);
|
|
|
|
opt->priv = NULL;
|
|
|
|
|
2006-09-18 01:02:52 +02:00
|
|
|
/* NEEDSWORK:
|
|
|
|
* The real "grep -c foo *.c" gives many "bar.c:0" lines,
|
|
|
|
* which feels mostly useless but sometimes useful. Maybe
|
|
|
|
* make it another option? For now suppress them.
|
|
|
|
*/
|
2010-01-25 23:51:39 +01:00
|
|
|
if (opt->count && count) {
|
|
|
|
char buf[32];
|
2014-03-11 22:15:49 +01:00
|
|
|
if (opt->pathname) {
|
|
|
|
output_color(opt, gs->name, strlen(gs->name),
|
|
|
|
opt->color_filename);
|
|
|
|
output_sep(opt, ':');
|
|
|
|
}
|
2017-03-28 21:46:56 +02:00
|
|
|
xsnprintf(buf, sizeof(buf), "%u\n", count);
|
2010-01-25 23:51:39 +01:00
|
|
|
opt->output(opt, buf, strlen(buf));
|
2010-05-22 23:29:35 +02:00
|
|
|
return 1;
|
2010-01-25 23:51:39 +01:00
|
|
|
}
|
2006-09-18 01:02:52 +02:00
|
|
|
return !!last_hit;
|
|
|
|
}
|
|
|
|
|
2006-09-28 02:50:52 +02:00
|
|
|
static void clr_hit_marker(struct grep_expr *x)
|
|
|
|
{
|
|
|
|
/* All-hit markers are meaningful only at the very top level
|
|
|
|
* OR node.
|
|
|
|
*/
|
|
|
|
while (1) {
|
|
|
|
x->hit = 0;
|
|
|
|
if (x->node != GREP_NODE_OR)
|
|
|
|
return;
|
|
|
|
x->u.binary.left->hit = 0;
|
|
|
|
x = x->u.binary.right;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int chk_hit_marker(struct grep_expr *x)
|
|
|
|
{
|
|
|
|
/* Top level nodes have hit markers. See if they all are hits */
|
|
|
|
while (1) {
|
|
|
|
if (x->node != GREP_NODE_OR)
|
|
|
|
return x->hit;
|
|
|
|
if (!x->u.binary.left->hit)
|
|
|
|
return 0;
|
|
|
|
x = x->u.binary.right;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
int grep_source(struct grep_opt *opt, struct grep_source *gs)
|
2006-09-28 02:50:52 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* we do not have to do the two-pass grep when we do not check
|
|
|
|
* buffer-wide "all-match".
|
|
|
|
*/
|
|
|
|
if (!opt->all_match)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
return grep_source_1(opt, gs, 0);
|
2006-09-28 02:50:52 +02:00
|
|
|
|
|
|
|
/* Otherwise the toplevel "or" terms hit a bit differently.
|
|
|
|
* We first clear hit markers from them.
|
|
|
|
*/
|
|
|
|
clr_hit_marker(opt->pattern_expression);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
grep_source_1(opt, gs, 1);
|
2006-09-28 02:50:52 +02:00
|
|
|
|
|
|
|
if (!chk_hit_marker(opt->pattern_expression))
|
|
|
|
return 0;
|
|
|
|
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
return grep_source_1(opt, gs, 0);
|
|
|
|
}
|
|
|
|
|
2012-02-02 09:20:10 +01:00
|
|
|
int grep_buffer(struct grep_opt *opt, char *buf, unsigned long size)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
{
|
|
|
|
struct grep_source gs;
|
|
|
|
int r;
|
|
|
|
|
2012-10-12 12:49:38 +02:00
|
|
|
grep_source_init(&gs, GREP_SOURCE_BUF, NULL, NULL, NULL);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
gs.buf = buf;
|
|
|
|
gs.size = size;
|
|
|
|
|
|
|
|
r = grep_source(opt, &gs);
|
|
|
|
|
|
|
|
grep_source_clear(&gs);
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
void grep_source_init(struct grep_source *gs, enum grep_source_type type,
|
2012-10-12 12:49:38 +02:00
|
|
|
const char *name, const char *path,
|
|
|
|
const void *identifier)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
{
|
|
|
|
gs->type = type;
|
2015-01-13 02:59:09 +01:00
|
|
|
gs->name = xstrdup_or_null(name);
|
|
|
|
gs->path = xstrdup_or_null(path);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
gs->buf = NULL;
|
|
|
|
gs->size = 0;
|
2012-02-02 09:20:43 +01:00
|
|
|
gs->driver = NULL;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case GREP_SOURCE_FILE:
|
|
|
|
gs->identifier = xstrdup(identifier);
|
|
|
|
break;
|
2017-05-30 19:30:44 +02:00
|
|
|
case GREP_SOURCE_OID:
|
|
|
|
gs->identifier = oiddup(identifier);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
break;
|
|
|
|
case GREP_SOURCE_BUF:
|
|
|
|
gs->identifier = NULL;
|
2016-12-16 20:03:19 +01:00
|
|
|
break;
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void grep_source_clear(struct grep_source *gs)
|
|
|
|
{
|
2017-06-16 01:15:49 +02:00
|
|
|
FREE_AND_NULL(gs->name);
|
|
|
|
FREE_AND_NULL(gs->path);
|
|
|
|
FREE_AND_NULL(gs->identifier);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
grep_source_clear_data(gs);
|
|
|
|
}
|
|
|
|
|
|
|
|
void grep_source_clear_data(struct grep_source *gs)
|
|
|
|
{
|
|
|
|
switch (gs->type) {
|
|
|
|
case GREP_SOURCE_FILE:
|
2017-05-30 19:30:44 +02:00
|
|
|
case GREP_SOURCE_OID:
|
2017-06-16 01:15:46 +02:00
|
|
|
FREE_AND_NULL(gs->buf);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
gs->size = 0;
|
|
|
|
break;
|
|
|
|
case GREP_SOURCE_BUF:
|
|
|
|
/* leave user-provided buf intact */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-30 19:30:44 +02:00
|
|
|
static int grep_source_load_oid(struct grep_source *gs)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
{
|
|
|
|
enum object_type type;
|
|
|
|
|
|
|
|
grep_read_lock();
|
|
|
|
gs->buf = read_sha1_file(gs->identifier, &type, &gs->size);
|
|
|
|
grep_read_unlock();
|
|
|
|
|
|
|
|
if (!gs->buf)
|
|
|
|
return error(_("'%s': unable to read %s"),
|
|
|
|
gs->name,
|
2017-05-30 19:30:44 +02:00
|
|
|
oid_to_hex(gs->identifier));
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int grep_source_load_file(struct grep_source *gs)
|
|
|
|
{
|
|
|
|
const char *filename = gs->identifier;
|
|
|
|
struct stat st;
|
|
|
|
char *data;
|
|
|
|
size_t size;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (lstat(filename, &st) < 0) {
|
|
|
|
err_ret:
|
|
|
|
if (errno != ENOENT)
|
2016-05-08 11:47:47 +02:00
|
|
|
error_errno(_("failed to stat '%s'"), filename);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!S_ISREG(st.st_mode))
|
|
|
|
return -1;
|
|
|
|
size = xsize_t(st.st_size);
|
|
|
|
i = open(filename, O_RDONLY);
|
|
|
|
if (i < 0)
|
|
|
|
goto err_ret;
|
2016-02-22 23:44:28 +01:00
|
|
|
data = xmallocz(size);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
if (st.st_size != read_in_full(i, data, size)) {
|
2016-05-08 11:47:47 +02:00
|
|
|
error_errno(_("'%s': short read"), filename);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
close(i);
|
|
|
|
free(data);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
close(i);
|
|
|
|
|
|
|
|
gs->buf = data;
|
|
|
|
gs->size = size;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-09-20 23:20:09 +02:00
|
|
|
static int grep_source_load(struct grep_source *gs)
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
{
|
|
|
|
if (gs->buf)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
switch (gs->type) {
|
|
|
|
case GREP_SOURCE_FILE:
|
|
|
|
return grep_source_load_file(gs);
|
2017-05-30 19:30:44 +02:00
|
|
|
case GREP_SOURCE_OID:
|
|
|
|
return grep_source_load_oid(gs);
|
grep: refactor the concept of "grep source" into an object
The main interface to the low-level grep code is
grep_buffer, which takes a pointer to a buffer and a size.
This is convenient and flexible (we use it to grep commit
bodies, files on disk, and blobs by sha1), but it makes it
hard to pass extra information about what we are grepping
(either for correctness, like overriding binary
auto-detection, or for optimizations, like lazily loading
blob contents).
Instead, let's encapsulate the idea of a "grep source",
including the buffer, its size, and where the data is coming
from. This is similar to the diff_filespec structure used by
the diff code (unsurprising, since future patches will
implement some of the same optimizations found there).
The diffstat is slightly scarier than the actual patch
content. Most of the modified lines are simply replacing
access to raw variables with their counterparts that are now
in a "struct grep_source". Most of the added lines were
taken from builtin/grep.c, which partially abstracted the
idea of grep sources (for file vs sha1 sources).
Instead of dropping the now-redundant code, this patch
leaves builtin/grep.c using the traditional grep_buffer
interface (which now wraps the grep_source interface). That
makes it easy to test that there is no change of behavior
(yet).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 09:19:28 +01:00
|
|
|
case GREP_SOURCE_BUF:
|
|
|
|
return gs->buf ? 0 : -1;
|
|
|
|
}
|
2016-12-16 20:03:19 +01:00
|
|
|
die("BUG: invalid grep_source type to load");
|
2006-09-28 02:50:52 +02:00
|
|
|
}
|
2012-02-02 09:20:43 +01:00
|
|
|
|
|
|
|
void grep_source_load_driver(struct grep_source *gs)
|
|
|
|
{
|
|
|
|
if (gs->driver)
|
|
|
|
return;
|
|
|
|
|
|
|
|
grep_attr_lock();
|
2012-10-12 12:49:38 +02:00
|
|
|
if (gs->path)
|
|
|
|
gs->driver = userdiff_find_by_path(gs->path);
|
2012-02-02 09:20:43 +01:00
|
|
|
if (!gs->driver)
|
|
|
|
gs->driver = userdiff_find_by_name("default");
|
|
|
|
grep_attr_unlock();
|
|
|
|
}
|
2012-02-02 09:21:02 +01:00
|
|
|
|
2012-09-20 23:20:09 +02:00
|
|
|
static int grep_source_is_binary(struct grep_source *gs)
|
2012-02-02 09:21:02 +01:00
|
|
|
{
|
|
|
|
grep_source_load_driver(gs);
|
|
|
|
if (gs->driver->binary != -1)
|
|
|
|
return gs->driver->binary;
|
|
|
|
|
|
|
|
if (!grep_source_load(gs))
|
|
|
|
return buffer_is_binary(gs->buf, gs->size);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|