diff --git a/.golangci.yml b/.golangci.yml index b76f305..08583ea 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -88,6 +88,7 @@ linters: linters: - errcheck - noctx + - gosec # fake credentials in secret-scanner fixtures are not real keys issues: max-issues-per-linter: 50 diff --git a/README.md b/README.md index 7f9911b..906c50d 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ sif has a modular architecture. modules are defined in yaml and can be extended | `-ports` | port scanning (common/full) | | `-nuclei` | vulnerability scanning with nuclei templates | | `-dork` | automated google dorking | -| `-js` | javascript analysis | +| `-js` | javascript analysis + secret and endpoint extraction | | `-c3` | cloud storage misconfiguration | | `-headers` | http header analysis | | `-sh` | security header analysis (missing/weak headers) | diff --git a/docs/usage.md b/docs/usage.md index 5a1db9e..6f49b74 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -79,7 +79,7 @@ scopes: `common` (top ports), `full` (all ports) ### javascript analysis -`-js` - analyze javascript files +`-js` - analyze javascript files + secret and endpoint extraction ```bash ./sif -u https://example.com -js diff --git a/internal/scan/js/endpoints.go b/internal/scan/js/endpoints.go new file mode 100644 index 0000000..4dcf26d --- /dev/null +++ b/internal/scan/js/endpoints.go @@ -0,0 +1,128 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package js + +import ( + "net/url" + "regexp" + "slices" + "strings" + + urlutil "github.com/projectdiscovery/utils/url" +) + +// endpointRegex is a linkfinder-style matcher for quoted paths and urls inside +// js: full http(s) urls, root-relative (/api/...) 
and dotted-relative paths, +// plus bare api-ish words with an extension. the inner alternation lives in a +// single capture group so FindAllStringSubmatch hands back just the value. +var endpointRegex = regexp.MustCompile(`["'\x60]` + + `(` + + `(?:https?:)?//[^\s"'\x60]{2,}` + // protocol-relative or absolute url + `|` + + `/[A-Za-z0-9_\-./]+(?:\?[^\s"'\x60]*)?` + // root-relative path + `|` + + `\.{1,2}/[A-Za-z0-9_\-./]+(?:\?[^\s"'\x60]*)?` + // dotted-relative path + `)` + + `["'\x60]`) + +// shortest thing we'll treat as an endpoint; below this it's almost always +// noise like "/" or a single slash-prefixed letter. +const minEndpointLen = 3 + +// mime types slip through the path regex (text/html, application/json, ...) but +// are never endpoints, so they're filtered out by their top-level type. +var mimePrefixes = []string{ + "text/", "image/", "audio/", "video/", "font/", + "application/", "multipart/", "model/", "message/", +} + +// ExtractEndpoints pulls candidate paths and urls out of a script body, dedupes +// them, drops obvious noise, and resolves relatives against baseURL so callers +// get absolute targets where possible. a baseURL that won't parse just leaves +// relatives as-is rather than failing the whole scan. +func ExtractEndpoints(content, baseURL string) []string { + groups := endpointRegex.FindAllStringSubmatch(content, -1) + if len(groups) == 0 { + return nil + } + + base, baseErr := urlutil.Parse(baseURL) + + endpoints := make([]string, 0, len(groups)) + seen := make(map[string]struct{}, len(groups)) + for i := 0; i < len(groups); i++ { + candidate := strings.TrimSpace(groups[i][1]) + if !isEndpoint(candidate) { + continue + } + + resolved := candidate + // only relatives need resolving, and only if the base parsed cleanly. 
+ if baseErr == nil && base.URL != nil && isRelative(candidate) { + resolved = resolveRelative(base.URL, candidate) + } + + if _, ok := seen[resolved]; ok { + continue + } + seen[resolved] = struct{}{} + endpoints = append(endpoints, resolved) + } + + slices.Sort(endpoints) + return endpoints +} + +// isEndpoint filters out the junk that the broad regex inevitably catches: +// too-short fragments, mime types, and single dotted words with no path. +func isEndpoint(s string) bool { + if len(s) < minEndpointLen { + return false + } + + lower := strings.ToLower(s) + for i := 0; i < len(mimePrefixes); i++ { + // a mime type is "type/subtype" with no further path; an api route like + // /application/users has a leading slash, so anchor on the bare prefix. + if strings.HasPrefix(lower, mimePrefixes[i]) && !strings.HasPrefix(lower, "/") { + return false + } + } + + // reject "word" or "a.b" with no slash at all: not a path, just a token. + if !strings.Contains(s, "/") { + return false + } + + return true +} + +// isRelative reports whether candidate lacks a scheme/host and so needs the +// base url to become absolute. protocol-relative (//host) and absolute urls +// are left untouched. +func isRelative(candidate string) bool { + if strings.HasPrefix(candidate, "//") { + return false + } + return !strings.HasPrefix(candidate, "http://") && !strings.HasPrefix(candidate, "https://") +} + +// resolveRelative turns a relative path into an absolute url against base using +// the stdlib reference resolver; if the ref won't parse we keep the original. 
// a nil base is handled like an unparseable ref: with nothing to resolve
// against, ref is handed back unchanged rather than dereferencing nil inside
// ResolveReference.
func resolveRelative(base *url.URL, ref string) string {
	if base == nil {
		return ref
	}

	parsed, err := url.Parse(ref)
	if err != nil {
		// best-effort: a ref the stdlib rejects is still a finding.
		return ref
	}
	return base.ResolveReference(parsed).String()
}
[]string{"hello", "world"}, + }, + { + name: "multiple endpoints deduped", + content: `fetch("/api/users"); fetch("/api/users"); fetch("/api/posts");`, + wantSome: []string{ + "https://example.com/api/users", + "https://example.com/api/posts", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ExtractEndpoints(tt.content, base) + + for _, want := range tt.wantSome { + if !slices.Contains(got, want) { + t.Errorf("expected %q in %v", want, got) + } + } + for _, absent := range tt.wantAbsent { + if slices.Contains(got, absent) { + t.Errorf("did not expect %q in %v", absent, got) + } + } + }) + } +} + +func TestExtractEndpointsDedupes(t *testing.T) { + got := ExtractEndpoints(`fetch("/api/x"); fetch("/api/x");`, "https://example.com/app.js") + count := 0 + for i := 0; i < len(got); i++ { + if got[i] == "https://example.com/api/x" { + count++ + } + } + if count != 1 { + t.Fatalf("expected /api/x once, got %d times in %v", count, got) + } +} + +func TestExtractEndpointsBadBaseKeepsRelatives(t *testing.T) { + // a base url that won't parse must not drop findings; relatives stay as-is. + got := ExtractEndpoints(`fetch("/api/users")`, "::not a url::") + if !slices.Contains(got, "/api/users") { + t.Errorf("expected relative /api/users preserved, got %v", got) + } +} diff --git a/internal/scan/js/scan.go b/internal/scan/js/scan.go index 519b11e..2cc3981 100644 --- a/internal/scan/js/scan.go +++ b/internal/scan/js/scan.go @@ -32,6 +32,8 @@ import ( type JavascriptScanResult struct { SupabaseResults []supabaseScanResult `json:"supabase_results"` FoundEnvironmentVars map[string]string `json:"environment_variables"` + SecretMatches []SecretMatch `json:"secret_matches"` + Endpoints []string `json:"endpoints"` } // ResultType implements the ScanResult interface. 
@@ -116,6 +118,11 @@ func JavascriptScan(url string, timeout time.Duration, threads int, logdir strin log.Info("Got %d scripts, now running scans on them", len(scripts)) supabaseResults := make([]supabaseScanResult, 0, len(scripts)) + secretMatches := make([]SecretMatch, 0) + endpoints := make([]string, 0) + // dedupe secrets and endpoints across every script, not just within one. + seenSecrets := make(map[string]struct{}) + seenEndpoints := make(map[string]struct{}) for _, script := range scripts { charmlog.Debugf("Scanning %s", script) req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, script, http.NoBody) @@ -147,16 +154,41 @@ func JavascriptScan(url string, timeout time.Duration, threads int, logdir strin if scriptSupabaseResults != nil { supabaseResults = append(supabaseResults, scriptSupabaseResults...) } + + // reuse the same script buffer for credential and endpoint extraction. + for _, match := range ScanSecrets(content, script) { + key := match.Rule + "\x00" + match.Match + if _, ok := seenSecrets[key]; ok { + continue + } + seenSecrets[key] = struct{}{} + secretMatches = append(secretMatches, match) + log.Warn("found %s in %s", match.Rule, script) + } + + for _, endpoint := range ExtractEndpoints(content, script) { + if _, ok := seenEndpoints[endpoint]; ok { + continue + } + seenEndpoints[endpoint] = struct{}{} + endpoints = append(endpoints, endpoint) + } } spin.Stop() + if len(endpoints) > 0 { + log.Info("extracted %d endpoints", len(endpoints)) + } + result := JavascriptScanResult{ SupabaseResults: supabaseResults, FoundEnvironmentVars: map[string]string{}, + SecretMatches: secretMatches, + Endpoints: endpoints, } - log.Complete(len(supabaseResults), "found") + log.Complete(len(supabaseResults)+len(secretMatches)+len(endpoints), "found") return &result, nil } diff --git a/internal/scan/js/secrets.go b/internal/scan/js/secrets.go new file mode 100644 index 0000000..892b5fb --- /dev/null +++ b/internal/scan/js/secrets.go @@ -0,0 
// SecretMatch is one credential the scanner pulled out of a script.
type SecretMatch struct {
	Rule   string `json:"rule"`   // name of the rule that fired
	Match  string `json:"match"`  // the reported value
	Source string `json:"source"` // url of the script it was found in
}

// entropy thresholds gate the noisy rules: provider-prefixed keys are
// trustworthy on their own, but a bare apikey="..." or a loose 40-char blob is
// only worth reporting once its shannon entropy clears the bar for "this looks
// random, not an english word". the generic bar is higher than the aws-secret
// bar because the generic capture group also catches ordinary identifiers.
const (
	genericMinEntropy   = 3.5
	awsSecretMinEntropy = 3.0
	// rules with no entropy requirement (prefix is already unique enough).
	noEntropyGate = 0.0
)

// secretRules is the credential regex bank. rules that prefix a keyword put
// the value in capture group 2; for every other rule the whole match is the
// reported value. minEntropy gates the high-noise rules so that short or
// repetitive literals are not flagged.
var secretRules = []struct {
	name       string
	re         *regexp.Regexp
	minEntropy float64
}{
	{
		// aws access key ids are fixed-shape and unmistakable.
		name:       "aws access key id",
		re:         regexp.MustCompile(`\b((?:AKIA|ABIA|ACCA|ASIA)[0-9A-Z]{16})\b`),
		minEntropy: noEntropyGate,
	},
	{
		// aws secret keys are 40-char base64-ish blobs; gate on entropy since
		// the shape alone matches plenty of innocent strings.
		//   - the keyword is case-insensitive so env-style
		//     AWS_SECRET_ACCESS_KEY=... is caught as well.
		//   - the value must be followed by a non-base64 character or the end
		//     of input. a trailing \b would reject keys that end in '/' or
		//     '+', and would accept the first 40 chars of a longer blob.
		name:       "aws secret access key",
		re:         regexp.MustCompile(`\b((?i:aws_secret_access_key|aws_secret_key|aws_secret|secret_key)["']?\s*[:=]\s*["']?)([A-Za-z0-9/+]{40})(?:[^A-Za-z0-9/+=]|$)`),
		minEntropy: awsSecretMinEntropy,
	},
	{
		// github personal/oauth/user/server/refresh tokens share the ghX_ prefix.
		name:       "github token",
		re:         regexp.MustCompile(`\b((?:ghp|gho|ghu|ghs|ghr)_[0-9A-Za-z]{36,255})\b`),
		minEntropy: noEntropyGate,
	},
	{
		// slack bot/user/app/legacy tokens.
		name:       "slack token",
		re:         regexp.MustCompile(`\b(xox[baprs]-[0-9A-Za-z-]{10,})\b`),
		minEntropy: noEntropyGate,
	},
	{
		// stripe live secret and publishable keys (test keys are not findings).
		name:       "stripe live key",
		re:         regexp.MustCompile(`\b([sp]k_live_[0-9A-Za-z]{16,})\b`),
		minEntropy: noEntropyGate,
	},
	{
		// google api keys are a fixed AIza-prefixed 39-char shape.
		name:       "google api key",
		re:         regexp.MustCompile(`\b(AIza[0-9A-Za-z_-]{35})\b`),
		minEntropy: noEntropyGate,
	},
	{
		// pem private key blocks; the header alone is the smoking gun. pgp
		// armor reads "PGP PRIVATE KEY BLOCK", hence the optional " BLOCK";
		// pkcs#8 encrypted keys use the "ENCRYPTED " qualifier.
		name:       "private key",
		re:         regexp.MustCompile(`-{5}BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED |PGP )?PRIVATE KEY(?: BLOCK)?-{5}`),
		minEntropy: noEntropyGate,
	},
	{
		// generic apikey/secret/token = "<value>" assignments; the value is in
		// group 2 and only reported if it looks random (entropy gate).
		name:       "generic secret assignment",
		re:         regexp.MustCompile(`(?i)\b(api[_-]?key|secret|token|password|passwd|auth)["']?\s*[:=]\s*["']([0-9A-Za-z\-._~+/]{16,})["']`),
		minEntropy: genericMinEntropy,
	},
}

// the value capture group lives at index 2 for the rules that prefix the
// keyword; index 0 (whole match) is used otherwise.
const (
	valueGroupIndex = 2
	wholeMatchIndex = 0
)

// ScanSecrets runs the regex bank over a script body and returns every gated
// match, deduped within this one source. srcURL is recorded on each find.
+func ScanSecrets(content, srcURL string) []SecretMatch { + matches := make([]SecretMatch, 0) + seen := make(map[string]struct{}) + + for i := 0; i < len(secretRules); i++ { + rule := secretRules[i] + groups := rule.re.FindAllStringSubmatch(content, -1) + for j := 0; j < len(groups); j++ { + value := secretValue(groups[j]) + if value == "" { + continue + } + + // entropy gate weeds out english-y identifiers for the generic rules; + // prefixed rules pass with a zero threshold. + if rule.minEntropy > noEntropyGate && shannonEntropy(value) < rule.minEntropy { + continue + } + + // dedupe per source so a key referenced twice is one finding. + key := rule.name + "\x00" + value + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + + matches = append(matches, SecretMatch{Rule: rule.name, Match: value, Source: srcURL}) + } + } + + return matches +} + +// secretValue returns the reported portion of a regex match: the dedicated +// value group when the rule captures one, otherwise the whole match. +func secretValue(groups []string) string { + if len(groups) > valueGroupIndex && groups[valueGroupIndex] != "" { + return groups[valueGroupIndex] + } + return strings.TrimSpace(groups[wholeMatchIndex]) +} + +// shannonEntropy is the per-character shannon entropy (bits) of s, used to tell +// random-looking secrets apart from plain words. empty input is zero entropy. 
// the sum runs over distinct runes in first-seen order rather than in map
// iteration order, so the same input always produces the same float and a
// value sitting right on an entropy gate cannot flip between calls.
func shannonEntropy(s string) float64 {
	if s == "" {
		return 0
	}

	counts := make(map[rune]int)
	order := make([]rune, 0, len(s)) // distinct runes, first occurrence first
	total := 0                       // rune count, tallied in the same pass
	for _, r := range s {
		if counts[r] == 0 {
			order = append(order, r)
		}
		counts[r]++
		total++
	}

	length := float64(total)
	var entropy float64
	for _, r := range order {
		p := float64(counts[r]) / length
		entropy -= p * math.Log2(p)
	}

	return entropy
}
+) + +func TestScanSecrets(t *testing.T) { + tests := []struct { + name string + content string + wantRule string // rule expected on the first match, "" means no match + wantNone bool + }{ + { + name: "aws access key id", + content: fmt.Sprintf(`const k = %q;`, fakeAWSKey), + wantRule: "aws access key id", + }, + { + name: "github personal token", + content: fmt.Sprintf(`token: %q`, fakeGitHub), + wantRule: "github token", + }, + { + name: "slack bot token", + content: fmt.Sprintf(`slack=%q`, fakeSlack), + wantRule: "slack token", + }, + { + name: "stripe live secret key", + content: fmt.Sprintf(`var sk = %q;`, fakeStripe), + wantRule: "stripe live key", + }, + { + name: "google api key", + content: fmt.Sprintf(`apiKey: %q`, fakeGoogle), + wantRule: "google api key", + }, + { + name: "pem private key header", + content: fakePEM, + wantRule: "private key", + }, + { + name: "generic high-entropy api key assignment", + content: fmt.Sprintf(`apikey = %q`, fakeGeneric), + wantRule: "generic secret assignment", + }, + { + name: "aws secret with entropy", + content: fmt.Sprintf(`aws_secret_access_key=%q`, fakeAWSSecret), + wantRule: "aws secret access key", + }, + { + // low-entropy assignment is a placeholder, not a real secret. + name: "low entropy generic assignment not flagged", + content: `password = "aaaaaaaaaaaaaaaaaaaaaaaa"`, + wantNone: true, + }, + { + // a repetitive placeholder is low-entropy and must not trip the gate. 
+ name: "low entropy repeated pattern not flagged", + content: `token = "abababababababababababab"`, + wantNone: true, + }, + { + name: "no secrets in plain code", + content: `function add(a, b) { return a + b; }`, + wantNone: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ScanSecrets(tt.content, "https://example.com/app.js") + + if tt.wantNone { + if len(got) != 0 { + t.Fatalf("expected no matches, got %+v", got) + } + return + } + + if len(got) == 0 { + t.Fatalf("expected a %q match, got none", tt.wantRule) + } + if got[0].Rule != tt.wantRule { + t.Errorf("rule = %q, want %q", got[0].Rule, tt.wantRule) + } + if got[0].Match == "" { + t.Error("match value is empty") + } + if got[0].Source != "https://example.com/app.js" { + t.Errorf("source = %q, want the passed url", got[0].Source) + } + }) + } +} + +func TestScanSecretsDedupesWithinSource(t *testing.T) { + // the same key referenced twice in one file is one finding. + content := fmt.Sprintf(`a = %q; b = %q;`, fakeAWSKey, fakeAWSKey) + got := ScanSecrets(content, "https://example.com/app.js") + if len(got) != 1 { + t.Fatalf("expected 1 deduped match, got %d: %+v", len(got), got) + } +} + +func TestShannonEntropy(t *testing.T) { + tests := []struct { + name string + input string + // random-ish strings clear the generic gate, repetitive ones don't. 
+ wantHigh bool + }{ + {name: "empty is zero", input: "", wantHigh: false}, + {name: "repeated char is low", input: "aaaaaaaaaaaaaaaa", wantHigh: false}, + {name: "random blob is high", input: fakeGeneric, wantHigh: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := shannonEntropy(tt.input) + if tt.wantHigh && got < genericMinEntropy { + t.Errorf("entropy %f below generic gate %f", got, genericMinEntropy) + } + if !tt.wantHigh && got >= genericMinEntropy { + t.Errorf("entropy %f unexpectedly cleared generic gate %f", got, genericMinEntropy) + } + }) + } +} diff --git a/man/sif.1 b/man/sif.1 index f70e3cd..968430e 100644 --- a/man/sif.1 +++ b/man/sif.1 @@ -51,7 +51,7 @@ vulnerability scanning with nuclei templates. automated google dorking. .TP .B \-js -javascript analysis. +javascript analysis + secret and endpoint extraction. .TP .B \-c3 cloud storage misconfiguration scan.