Skip to content

Commit 4158734

Browse files
[INS-285] Fix custom detectors line number reporting to match the full regex instead of capture group (#4697)
* set the full match as primary secret for correct line number reporting * add test case for multiline, update documentation * update comment * add test case to engine * sort regexes before selecting first for deterministic behaviour * remove leftover todo comment --------- Co-authored-by: Kashif Khan <70996046+kashifkhan0771@users.noreply.github.com>
1 parent e9734c1 commit 4158734

File tree

4 files changed

+118
-3
lines changed

4 files changed

+118
-3
lines changed

pkg/custom_detectors/CUSTOM_DETECTORS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ This guide will walk you through setting up a custom detector in TruffleHog to i
3838
- **`verify`**: An optional section to validate detected secrets. If you want to verify or unverify detected secrets, this section needs to be configured. If not configured, all detected secrets will be marked as unverified. Read [verification server examples](#verification-server-examples)
3939

4040
**Other allowed parameters:**
41-
- **`primary_regex_name`**: This parameter allows you designate the primary regex pattern when multiple regex patterns are defined in the regex section. If a match is found, the match for the designated primary regex will be used to determine the line number. The value must be one of the names specified in the regex section.
41+
- **`primary_regex_name`**: This parameter allows you to designate the primary regex pattern when multiple regex patterns are defined in the regex section. If a match is found, the match for the designated primary regex will be used to determine the line number. The value must be one of the names specified in the regex section. If not provided, the first regex name in sorted order will be used as the primary regex by default.
4242
- **`exclude_regexes_capture`**: This parameter allows you to define regex patterns to exclude specific parts of a detected secret. If a match is found within the detected secret, the portion matching this regex is excluded from the result.
4343
- **`exclude_regexes_match`**: This parameter enables you to define regex patterns to exclude entire matches from being reported as secrets. This applies to the entire matched string, not just the token.
4444
- **`entropy`**: This parameter is used to assess the randomness of detected strings. High entropy often indicates that a string is a potential secret, such as an API key or password, due to its complexity and unpredictability. It helps in filtering false-positives. While an entropy threshold of `3` can be a starting point, it's essential to adjust this value based on your project's specific requirements and the nature of the data you have.

pkg/custom_detectors/custom_detectors.go

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ func NewWebhookCustomRegex(pb *custom_detectorspb.CustomRegex) (*CustomRegexWebh
6666
}
6767
}
6868

69+
// Ensure primary regex name is set.
70+
ensurePrimaryRegexNameSet(pb)
71+
6972
// TODO: Copy only necessary data out of pb.
7073
return &CustomRegexWebhook{pb}, nil
7174
}
@@ -229,14 +232,27 @@ func (c *CustomRegexWebhook) createResults(ctx context.Context, match map[string
229232
values := match[key]
230233
// values[0] contains the entire regex match.
231234
secret := values[0]
235+
fullMatch := values[0]
232236
if len(values) > 1 {
233237
secret = values[1]
234238
}
235239
raw += secret
236240

237-
// if the match is of the primary regex, set it's value as primary secret value in result
241+
// We set the full regex match as the primary secret value.
242+
// Reasoning:
243+
// The engine calculates the line number using the match. When a primary secret is set, it uses that value instead of the raw secret.
244+
// While the secret match itself is sufficient to calculate the line number, the same group match could appear elsewhere in the data.
245+
// To avoid ambiguity, we store the full regex match as the primary secret value.
246+
// This primary secret value is used only for identifying the exact line number and is not used anywhere else.
247+
248+
// Example:
249+
// Full regex match: secret = ABC123
250+
// Secret (raw): ABC123
251+
252+
// In this case, the primary secret value stores the full string `secret = ABC123`,
253+
// allowing the engine to pinpoint the exact location and avoid matching redundant occurrences of `ABC123` in the data.
238254
if c.PrimaryRegexName == key {
239-
result.SetPrimarySecretValue(secret)
255+
result.SetPrimarySecretValue(fullMatch)
240256
}
241257
}
242258

@@ -394,3 +410,15 @@ func (c *CustomRegexWebhook) Description() string {
394410
}
395411
return c.GetDescription()
396412
}
413+
414+
// ensurePrimaryRegexNameSet sets the PrimaryRegexName field to the
415+
// first regex name in sorted order if it is not already set.
416+
// We're sorting to ensure deterministic behavior.
417+
func ensurePrimaryRegexNameSet(pb *custom_detectorspb.CustomRegex) {
418+
if pb.PrimaryRegexName == "" {
419+
for _, name := range slices.Sorted(maps.Keys(pb.Regex)) {
420+
pb.PrimaryRegexName = name
421+
return
422+
}
423+
}
424+
}

pkg/custom_detectors/custom_detectors_test.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,60 @@ func TestDetectorPrimarySecret(t *testing.T) {
232232
assert.Equal(t, "secret_YI7C90ACY1_yy", results[0].GetPrimarySecretValue())
233233
}
234234

235+
func TestDetectorPrimarySecretFullMatch(t *testing.T) {
236+
tests := []struct {
237+
name string
238+
input *custom_detectorspb.CustomRegex
239+
chunk []byte
240+
want string
241+
}{
242+
{
243+
name: "primary regex full match",
244+
input: &custom_detectorspb.CustomRegex{
245+
Name: "test",
246+
Keywords: []string{"secret"},
247+
Regex: map[string]string{"secret": `secret *= *"([^"\r\n]+)"`},
248+
PrimaryRegexName: "secret",
249+
},
250+
chunk: []byte(`
251+
// some code
252+
secret="mysecret"
253+
// some code
254+
`),
255+
want: `secret="mysecret"`,
256+
},
257+
{
258+
name: "primary regex full match multiline",
259+
input: &custom_detectorspb.CustomRegex{
260+
Name: "test",
261+
Keywords: []string{"secret"},
262+
Regex: map[string]string{"secret": `secret *= *"([^"]+)"`},
263+
PrimaryRegexName: "secret",
264+
},
265+
chunk: []byte(`
266+
// some code
267+
secret="mysecret
268+
thatspansmultiplelines"
269+
// some code
270+
`),
271+
want: `secret="mysecret
272+
thatspansmultiplelines"`,
273+
},
274+
}
275+
276+
for _, tt := range tests {
277+
t.Run(tt.name, func(t *testing.T) {
278+
detector, err := NewWebhookCustomRegex(tt.input)
279+
assert.NoError(t, err)
280+
results, err := detector.FromData(context.Background(), false, tt.chunk)
281+
assert.NoError(t, err)
282+
assert.Equal(t, 1, len(results))
283+
assert.Equal(t, tt.want, results[0].GetPrimarySecretValue())
284+
})
285+
}
286+
287+
}
288+
235289
func TestDetectorValidations(t *testing.T) {
236290
type args struct {
237291
CustomRegex *custom_detectorspb.CustomRegex
@@ -707,6 +761,24 @@ func TestNewWebhookCustomRegex_Validation(t *testing.T) {
707761
}
708762
}
709763

764+
func TestNewWebhookCustomRegex_EnsurePrimaryRegexNameSet(t *testing.T) {
765+
t.Parallel()
766+
767+
pb := &custom_detectorspb.CustomRegex{
768+
Name: "test",
769+
Keywords: []string{"kw"},
770+
Regex: map[string]string{
771+
"regex_a": `regex_a`,
772+
"regex_b": `regex_b`,
773+
},
774+
// PrimaryRegexName is not set.
775+
}
776+
777+
detector, err := NewWebhookCustomRegex(pb)
778+
assert.NoError(t, err)
779+
assert.Equal(t, "regex_a", detector.GetPrimaryRegexName(), "expected PrimaryRegexName to be set to regex_a")
780+
}
781+
710782
func BenchmarkProductIndices(b *testing.B) {
711783
for i := 0; i < b.N; i++ {
712784
_ = productIndices(3, 2, 6)

pkg/engine/engine_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,21 @@ func TestFragmentLineOffsetWithPrimarySecret(t *testing.T) {
215215
}
216216
}
217217

218+
func TestFragmentLineOffsetWithPrimarySecretMultiline(t *testing.T) {
219+
result := &detectors.Result{
220+
Raw: []byte("secret here"),
221+
}
222+
result.SetPrimarySecretValue("secret:\nsecret here")
223+
224+
chunk := &sources.Chunk{
225+
Data: []byte("line1\nline2\nsecret:\nsecret here\nline5"),
226+
}
227+
lineOffset, isIgnored := FragmentLineOffset(chunk, result)
228+
assert.False(t, isIgnored)
229+
// offset 2 means line 3
230+
assert.Equal(t, int64(2), lineOffset)
231+
}
232+
218233
func setupFragmentLineOffsetBench(totalLines, needleLine int) (*sources.Chunk, *detectors.Result) {
219234
data := make([]byte, 0, 4096)
220235
needle := []byte("needle")

0 commit comments

Comments
 (0)