From b4e78114d7d8d982eb02bb0216b1b3d00c45e6e5 Mon Sep 17 00:00:00 2001 From: vmfunc Date: Tue, 9 Jun 2026 17:54:23 -0700 Subject: [PATCH 1/3] feat(js): extract secrets and endpoints from scanned javascript the -js pipeline already pulls every is a script context + for { + open := strings.Index(body, "") + if closeIdx < 0 { + break + } + segment := body[open : open+closeIdx] + if strings.Contains(segment, canaryToken) { + return "script" + } + body = body[open+closeIdx+len(""):] + } + + // default: echoed inside an html attribute value + return "attribute" +} + +// survivingBreakChars reports which dangerous chars came back next to the canary +// unescaped. We only trust occurrences adjacent to the token so unrelated chars +// elsewhere on the page don't create false positives. +func survivingBreakChars(body string) []string { + survived := make([]string, 0, len(xssBreakChars)) + markers := []string{ + "<" + canaryToken, // leading < survived + canaryToken + ">", // trailing > survived + "\"" + canaryToken, // leading " survived + canaryToken + "'", // trailing ' survived + "`" + canaryToken, // backtick wrap survived (token + ` and ` + token) + canaryToken + "`", + } + present := make(map[string]bool, len(xssBreakChars)) + for i := 0; i < len(markers); i++ { + if !strings.Contains(body, markers[i]) { + continue + } + switch { + case strings.HasPrefix(markers[i], "<"): + present["<"] = true + case strings.HasSuffix(markers[i], ">"): + present[">"] = true + case strings.HasPrefix(markers[i], "\""): + present["\""] = true + case strings.HasSuffix(markers[i], "'"): + present["'"] = true + default: + present["`"] = true + } + } + + // keep the canonical order for stable output + for i := 0; i < len(xssBreakChars); i++ { + if present[xssBreakChars[i]] { + survived = append(survived, xssBreakChars[i]) + } + } + return survived +} + +// relevantForContext filters surviving chars to the ones that actually enable a +// breakout in the detected context: angle brackets matter 
in html, quotes and +// backticks matter inside attributes/scripts. +func relevantForContext(reflectCtx string, survived []string) []string { + wanted := make(map[string]bool, len(survived)) + switch reflectCtx { + case "html": + wanted["<"] = true + wanted[">"] = true + case "attribute": + // breaking out of an attribute value needs the quote that delimits it; a + // bare backtick isn't a delimiter in html, so it doesn't count here. + wanted["\""] = true + wanted["'"] = true + case "script": + // a quote, backtick, or angle bracket all let you close/escape the script + wanted["\""] = true + wanted["'"] = true + wanted["`"] = true + wanted["<"] = true + wanted[">"] = true + } + + filtered := make([]string, 0, len(survived)) + for i := 0; i < len(survived); i++ { + if wanted[survived[i]] { + filtered = append(filtered, survived[i]) + } + } + return filtered +} + +// ResultType identifies reflected-xss findings for the result registry. +func (r *XSSResult) ResultType() string { return "xss" } + +var _ ScanResult = (*XSSResult)(nil) diff --git a/internal/scan/xss_test.go b/internal/scan/xss_test.go new file mode 100644 index 0000000..66ade5a --- /dev/null +++ b/internal/scan/xss_test.go @@ -0,0 +1,153 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "html" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// reflectsRaw echoes the named param straight into html text, so the breaking +// chars survive unescaped - a reflected xss sink. 
+func reflectsRaw(param string) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + v := r.URL.Query().Get(param) + w.Header().Set("Content-Type", "text/html") + w.WriteHeader(http.StatusOK) + //nolint:gosec // deliberate reflected-xss fixture for the probe under test + w.Write([]byte("
" + v + "
")) + })) +} + +func TestXSS_DetectsRawHTMLReflection(t *testing.T) { + srv := reflectsRaw("q") + defer srv.Close() + + result, err := XSS(srv.URL, 5*time.Second, 4, "") + if err != nil { + t.Fatalf("XSS: %v", err) + } + if result == nil || len(result.Findings) == 0 { + t.Fatalf("expected reflected xss findings, got %+v", result) + } + + var found *XSSFinding + for i := range result.Findings { + if result.Findings[i].Parameter == "q" { + found = &result.Findings[i] + } + } + if found == nil { + t.Fatalf("expected a finding on param 'q', got %+v", result.Findings) + } + if found.Context != "html" { + t.Errorf("expected html context, got %s", found.Context) + } + if len(found.SurvivedRaw) == 0 { + t.Errorf("expected surviving breaking chars, got none") + } +} + +func TestXSS_NoFalsePositiveWhenEscaped(t *testing.T) { + // the server html-escapes the reflection, so no breaking char survives raw. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + v := r.URL.Query().Get("q") + w.Header().Set("Content-Type", "text/html") + w.WriteHeader(http.StatusOK) + w.Write([]byte("
" + html.EscapeString(v) + "
")) + })) + defer srv.Close() + + result, err := XSS(srv.URL, 5*time.Second, 4, "") + if err != nil { + t.Fatalf("XSS: %v", err) + } + if result != nil && len(result.Findings) > 0 { + t.Errorf("expected no findings when reflection is escaped, got %+v", result.Findings) + } +} + +func TestXSS_NoFalsePositiveWhenNotReflected(t *testing.T) { + // never echoes the input back, so nothing is injectable. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.WriteHeader(http.StatusOK) + w.Write([]byte("static page")) + })) + defer srv.Close() + + result, err := XSS(srv.URL, 5*time.Second, 4, "") + if err != nil { + t.Fatalf("XSS: %v", err) + } + if result != nil && len(result.Findings) > 0 { + t.Errorf("expected no findings on static page, got %+v", result.Findings) + } +} + +func TestClassifyXSSContext(t *testing.T) { + tests := []struct { + name string + body string + want string + }{ + { + name: "live html tag", + body: "
<" + canaryToken + ">
", + want: "html", + }, + { + name: "inside script block", + body: "", + want: "script", + }, + { + name: "attribute value", + body: ``, + want: "attribute", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := classifyXSSContext(tt.body); got != tt.want { + t.Errorf("classifyXSSContext() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestSurvivingBreakChars(t *testing.T) { + // the canary is wrapped exactly as the probe injects it; all five chars survive. + body := "<" + canaryToken + ">\"" + canaryToken + "'`" + canaryToken + "`" + got := survivingBreakChars(body) + want := map[string]bool{"<": true, ">": true, "\"": true, "'": true, "`": true} + if len(got) != len(want) { + t.Fatalf("expected %d surviving chars, got %v", len(want), got) + } + for _, c := range got { + if !want[c] { + t.Errorf("unexpected surviving char %q", c) + } + } +} + +func TestXSSResult_ResultType(t *testing.T) { + r := &XSSResult{} + if r.ResultType() != "xss" { + t.Errorf("expected result type 'xss', got %q", r.ResultType()) + } +} diff --git a/man/sif.1 b/man/sif.1 index 968430e..4646086 100644 --- a/man/sif.1 +++ b/man/sif.1 @@ -86,6 +86,15 @@ sql reconnaissance (admin panels, error disclosure). .B \-lfi local file inclusion reconnaissance. .TP +.B \-cors +cors misconfiguration probe (reflected/permissive origins). +.TP +.B \-redirect +open redirect probe. +.TP +.B \-xss +reflected xss probe. +.TP .B \-framework framework detection with cve lookup. 
.TP diff --git a/sif.go b/sif.go index cee06fd..e1c6b09 100644 --- a/sif.go +++ b/sif.go @@ -391,6 +391,36 @@ func (app *App) Run() error { } } + if app.settings.CORS { + result, err := scan.CORS(url, app.settings.Timeout, app.settings.Threads, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running CORS probe: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "CORS") + } + } + + if app.settings.Redirect { + result, err := scan.Redirect(url, app.settings.Timeout, app.settings.Threads, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running open redirect probe: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "Open Redirect") + } + } + + if app.settings.XSS { + result, err := scan.XSS(url, app.settings.Timeout, app.settings.Threads, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running reflected XSS probe: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "Reflected XSS") + } + } + // Load and run modules if app.settings.AllModules || app.settings.Modules != "" || app.settings.ModuleTags != "" { loader, err := modules.NewLoader() From dbe79c495ee8bc7d7012a03a5fda9bb4648522b3 Mon Sep 17 00:00:00 2001 From: vmfunc Date: Tue, 9 Jun 2026 17:57:42 -0700 Subject: [PATCH 3/3] feat(scan): add web crawler and passive subdomain/url discovery -crawl spiders same-host links/scripts/forms through the shared httpx client so proxy/headers/rate-limit and robots.txt are honored, bounded by -crawl-depth. -passive pulls subdomains from keyless ct feeds (crt.sh, certspotter) and historical urls from wayback, each source isolated so one feed being down doesn't sink the rest and the target sees no traffic. 
--- README.md | 3 + docs/usage.md | 20 +++ go.mod | 2 +- internal/config/config.go | 10 ++ internal/scan/crawl.go | 137 +++++++++++++++++ internal/scan/crawl_test.go | 158 ++++++++++++++++++++ internal/scan/passive.go | 266 ++++++++++++++++++++++++++++++++++ internal/scan/passive_test.go | 163 +++++++++++++++++++++ man/sif.1 | 9 ++ sif.go | 20 +++ 10 files changed, 787 insertions(+), 1 deletion(-) create mode 100644 internal/scan/crawl.go create mode 100644 internal/scan/crawl_test.go create mode 100644 internal/scan/passive.go create mode 100644 internal/scan/passive_test.go diff --git a/README.md b/README.md index d780762..59359f3 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,9 @@ sif has a modular architecture. modules are defined in yaml and can be extended | `-redirect` | open redirect probe | | `-xss` | reflected xss probe | | `-framework` | framework detection with cve lookup | +| `-crawl` | web crawler (spider same-host links/scripts/forms) | +| `-crawl-depth` | max crawl recursion depth (default 2) | +| `-passive` | passive subdomain/url discovery (zero traffic to target) | ### http options diff --git a/docs/usage.md b/docs/usage.md index c58e16a..ae75048 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -186,6 +186,26 @@ export SHODAN_API_KEY=your-api-key ./sif -u https://example.com -framework ``` +### web crawler + +`-crawl` - spider the target, following same-host links, scripts and forms + +`-crawl-depth` - max recursion depth (default 2). respects robots.txt and stays on the target host. + +```bash +./sif -u https://example.com -crawl -crawl-depth 3 +``` + +### passive discovery + +`-passive` - gather subdomains from certificate transparency (crt.sh, certspotter) and historical urls from the wayback machine + +keyless and zero traffic to the target itself - all lookups hit third-party feeds. 
+ +```bash +./sif -u https://example.com -passive +``` + ### whois lookup `-whois` - perform whois lookups diff --git a/go.mod b/go.mod index 4382257..35bb982 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/charmbracelet/glamour v0.10.0 github.com/charmbracelet/lipgloss v1.1.1-0.20250404203927-76690c660834 github.com/charmbracelet/log v1.0.0 + github.com/gocolly/colly/v2 v2.1.0 github.com/likexian/whois v1.15.7 github.com/projectdiscovery/goflags v0.1.74 github.com/projectdiscovery/nuclei/v3 v3.8.0 @@ -160,7 +161,6 @@ require ( github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect github.com/goccy/go-json v0.10.5 // indirect - github.com/gocolly/colly/v2 v2.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang-jwt/jwt/v4 v4.5.2 // indirect github.com/golang-jwt/jwt/v5 v5.2.2 // indirect diff --git a/internal/config/config.go b/internal/config/config.go index c692925..bdd2f95 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -50,6 +50,9 @@ type Settings struct { Redirect bool XSS bool Framework bool + Crawl bool + CrawlDepth int + Passive bool Modules string // Comma-separated list of module IDs to run ModuleTags string // Run modules matching these tags AllModules bool // Run all loaded modules @@ -65,6 +68,10 @@ type Settings struct { // "negative WaitGroup counter"; clamp the parsed value up to this. const minThreads = 1 +// defaultCrawlDepth bounds how far the spider recurses by default; deep enough +// to find linked pages without crawling an entire site. 
+const defaultCrawlDepth = 2 + const ( Nil goflags.EnumVariable = iota @@ -114,6 +121,9 @@ func Parse() *Settings { flagSet.BoolVar(&settings.Redirect, "redirect", false, "Enable open redirect probe"), flagSet.BoolVar(&settings.XSS, "xss", false, "Enable reflected XSS probe"), flagSet.BoolVar(&settings.Framework, "framework", false, "Enable framework detection"), + flagSet.BoolVar(&settings.Crawl, "crawl", false, "Enable web crawling (spider same-host links/scripts/forms)"), + flagSet.IntVar(&settings.CrawlDepth, "crawl-depth", defaultCrawlDepth, "Max crawl recursion depth"), + flagSet.BoolVar(&settings.Passive, "passive", false, "Enable passive subdomain/url discovery (zero traffic to target)"), ) flagSet.CreateGroup("runtime", "Runtime", diff --git a/internal/scan/crawl.go b/internal/scan/crawl.go new file mode 100644 index 0000000..79a5859 --- /dev/null +++ b/internal/scan/crawl.go @@ -0,0 +1,137 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "fmt" + "net/url" + "sort" + "sync" + "time" + + "github.com/gocolly/colly/v2" + + "github.com/dropalldatabases/sif/internal/httpx" + "github.com/dropalldatabases/sif/internal/logger" + "github.com/dropalldatabases/sif/internal/output" +) + +// CrawlResult holds the deduped set of urls discovered by the spider. +type CrawlResult struct { + URLs []string `json:"urls"` +} + +func (r *CrawlResult) ResultType() string { return "crawl" } + +// compile-time check so a result-type drift fails the build, not a run. +var _ ScanResult = (*CrawlResult)(nil) + +// Crawl spiders the target up to depth, following same-host links/scripts/forms. 
+// all traffic flows through the shared httpx client so proxy/headers/rate-limit +// apply, and robots.txt is respected (colly honors it by default). +func Crawl(targetURL string, depth int, timeout time.Duration, logdir string) (*CrawlResult, error) { + log := output.Module("CRAWL") + log.Start() + + sanitizedURL := stripScheme(targetURL) + + if logdir != "" { + if err := logger.WriteHeader(sanitizedURL, logdir, "web crawl"); err != nil { + log.Error("error creating log file: %v", err) + return nil, fmt.Errorf("create crawl log: %w", err) + } + } + + // the host bounds the crawl; without it colly would wander the whole web. + parsed, err := url.Parse(targetURL) + if err != nil { + return nil, fmt.Errorf("parse target url %q: %w", targetURL, err) + } + host := parsed.Hostname() + if host == "" { + return nil, fmt.Errorf("target url %q has no host", targetURL) + } + + collector := colly.NewCollector( + colly.MaxDepth(depth), + colly.AllowedDomains(host), + ) + // reuse the shared client so proxy/cookie/-H/rate-limit are honored and the + // configured timeout applies to every fetch, robots.txt included. + collector.SetClient(httpx.Client(timeout)) + + // dedupe across the concurrent callbacks colly may fire. + var mu sync.Mutex + seen := make(map[string]struct{}) + + record := func(raw string) { + if raw == "" { + return + } + // keep the result set scoped to the target host; off-host assets + // (cdns, third-party links) are noise for an in-scope crawl. + if u, err := url.Parse(raw); err != nil || u.Hostname() != host { + return + } + mu.Lock() + if _, ok := seen[raw]; !ok { + seen[raw] = struct{}{} + log.Success("found: %s", output.Highlight.Render(raw)) + if logdir != "" { + _ = logger.Write(sanitizedURL, logdir, raw+"\n") + } + } + mu.Unlock() + } + + // links drive recursion; scripts/forms are recorded but not followed. 
+ collector.OnHTML("a[href]", func(e *colly.HTMLElement) { + link := e.Request.AbsoluteURL(e.Attr("href")) + record(link) + // Visit enforces AllowedDomains/MaxDepth itself, so off-host or + // too-deep links are dropped without us re-checking. + _ = e.Request.Visit(link) + }) + collector.OnHTML("script[src]", func(e *colly.HTMLElement) { + record(e.Request.AbsoluteURL(e.Attr("src"))) + }) + collector.OnHTML("form[action]", func(e *colly.HTMLElement) { + record(e.Request.AbsoluteURL(e.Attr("action"))) + }) + + collector.OnError(func(_ *colly.Response, e error) { + // a single bad page shouldn't abort the crawl; note it and move on. + log.Warn("crawl error: %v", e) + }) + + if err := collector.Visit(targetURL); err != nil { + log.Error("crawl failed: %v", err) + return nil, fmt.Errorf("visit %q: %w", targetURL, err) + } + collector.Wait() + + result := &CrawlResult{URLs: sortedKeys(seen)} + + log.Complete(len(result.URLs), "urls") + return result, nil +} + +// sortedKeys returns the map keys in a stable order so output is deterministic. 
+func sortedKeys(set map[string]struct{}) []string { + keys := make([]string, 0, len(set)) + for k := range set { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} diff --git a/internal/scan/crawl_test.go b/internal/scan/crawl_test.go new file mode 100644 index 0000000..c0cc260 --- /dev/null +++ b/internal/scan/crawl_test.go @@ -0,0 +1,158 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// crawlSite serves a small link graph: +// +// / -> links /a and an off-host page; references script.js, form action /submit +// /a -> links /b +// /b -> links /c (only reachable at depth 3) +// /c -> leaf +func crawlSite(t *testing.T) *httptest.Server { + t.Helper() + + mux := http.NewServeMux() + // no robots restrictions; colly fetches this before crawling. + mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" { + http.NotFound(w, r) + return + } + _, _ = w.Write([]byte(` + a + off + +
+ `)) + }) + mux.HandleFunc("/a", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`b`)) + }) + mux.HandleFunc("/b", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`c`)) + }) + mux.HandleFunc("/c", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`leaf`)) + }) + + srv := httptest.NewServer(mux) + t.Cleanup(srv.Close) + return srv +} + +func urlsContain(urls []string, want string) bool { + for i := 0; i < len(urls); i++ { + if urls[i] == want { + return true + } + } + return false +} + +func TestCrawl_FindsLinkedPagesAndAssets(t *testing.T) { + srv := crawlSite(t) + + result, err := Crawl(srv.URL, 3, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + // links, scripts and forms must all be recorded, resolved to absolute urls. + wants := []string{ + srv.URL + "/a", + srv.URL + "/b", + srv.URL + "/c", + srv.URL + "/script.js", + srv.URL + "/submit", + } + for _, w := range wants { + if !urlsContain(result.URLs, w) { + t.Errorf("expected crawl to find %q, got %v", w, result.URLs) + } + } + + // AllowedDomains must keep the off-host link out of the result set. + if urlsContain(result.URLs, "https://off-host.example/x") { + t.Errorf("off-host link should be excluded, got %v", result.URLs) + } +} + +func TestCrawl_RespectsDepth(t *testing.T) { + srv := crawlSite(t) + + // depth 1: only links found on the root page (/a, /script.js, /submit) are + // recorded; /b lives one hop deeper and must not appear. 
+ result, err := Crawl(srv.URL, 1, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + if !urlsContain(result.URLs, srv.URL+"/a") { + t.Errorf("depth 1 should find /a, got %v", result.URLs) + } + if urlsContain(result.URLs, srv.URL+"/b") { + t.Errorf("depth 1 must not reach /b, got %v", result.URLs) + } + if urlsContain(result.URLs, srv.URL+"/c") { + t.Errorf("depth 1 must not reach /c, got %v", result.URLs) + } +} + +func TestCrawl_Dedupes(t *testing.T) { + // a page that links the same target twice must yield a single entry. + mux := http.NewServeMux() + mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/dup" { + _, _ = w.Write([]byte(`leaf`)) + return + } + _, _ = w.Write([]byte(`12`)) + }) + srv := httptest.NewServer(mux) + defer srv.Close() + + result, err := Crawl(srv.URL, 2, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + count := 0 + for _, u := range result.URLs { + if u == srv.URL+"/dup" { + count++ + } + } + if count != 1 { + t.Errorf("expected /dup once after dedupe, got %d in %v", count, result.URLs) + } +} + +func TestCrawl_ResultType(t *testing.T) { + r := &CrawlResult{} + if r.ResultType() != "crawl" { + t.Errorf("ResultType = %q, want crawl", r.ResultType()) + } +} diff --git a/internal/scan/passive.go b/internal/scan/passive.go new file mode 100644 index 0000000..8d02ced --- /dev/null +++ b/internal/scan/passive.go @@ -0,0 +1,266 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + 
"io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/dropalldatabases/sif/internal/httpx" + "github.com/dropalldatabases/sif/internal/logger" + "github.com/dropalldatabases/sif/internal/output" +) + +// source base urls are vars so tests can repoint them at local fixtures. they +// carry a trailing %s for the domain (or query) each source expects. +var ( + crtshBaseURL = "https://crt.sh/?q=%%25.%s&output=json" + certspotterBaseURL = "https://api.certspotter.com/v1/issuances?domain=%s&include_subdomains=true&expand=dns_names" + waybackBaseURL = "http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=text&fl=original&collapse=urlkey" +) + +// cap the response we read from any one source so a hostile/huge feed can't +// exhaust memory. +const passiveMaxBytes = 25 * 1024 * 1024 + +// PassiveResult holds passively-gathered subdomains and historical urls. all +// data comes from third-party feeds; the target itself sees zero traffic. +type PassiveResult struct { + Subdomains []string `json:"subdomains"` + URLs []string `json:"urls"` +} + +func (r *PassiveResult) ResultType() string { return "passive" } + +// compile-time check so a result-type drift fails the build, not a run. +var _ ScanResult = (*PassiveResult)(nil) + +// crtshEntry is one certificate record from crt.sh; name_value may itself hold +// several newline-separated names. +type crtshEntry struct { + NameValue string `json:"name_value"` +} + +// certspotterEntry is one issuance from certspotter, expanded to dns names. +type certspotterEntry struct { + DNSNames []string `json:"dns_names"` +} + +// Passive performs keyless passive recon: subdomains from certificate +// transparency feeds plus historical urls from the wayback machine. each source +// fails independently so one feed being down doesn't sink the rest. 
+func Passive(targetURL string, timeout time.Duration, logdir string) (*PassiveResult, error) { + log := output.Module("PASSIVE") + log.Start() + + parsed, err := url.Parse(targetURL) + if err != nil { + return nil, fmt.Errorf("parse target url %q: %w", targetURL, err) + } + domain := parsed.Hostname() + if domain == "" { + return nil, fmt.Errorf("target url %q has no host", targetURL) + } + + sanitizedURL := stripScheme(targetURL) + if logdir != "" { + if err := logger.WriteHeader(sanitizedURL, logdir, "passive recon"); err != nil { + log.Error("error creating log file: %v", err) + return nil, fmt.Errorf("create passive log: %w", err) + } + } + + client := httpx.Client(timeout) + ctx := context.TODO() + + subSet := make(map[string]struct{}) + urlSet := make(map[string]struct{}) + + // crt.sh certificate transparency + if subs, err := fetchCrtsh(ctx, client, domain); err != nil { + log.Warn("crt.sh failed: %v", err) + } else { + addAll(subSet, subs) + } + + // certspotter certificate transparency + if subs, err := fetchCertspotter(ctx, client, domain); err != nil { + log.Warn("certspotter failed: %v", err) + } else { + addAll(subSet, subs) + } + + // wayback machine historical urls + if urls, err := fetchWayback(ctx, client, domain); err != nil { + log.Warn("wayback failed: %v", err) + } else { + addAll(urlSet, urls) + } + + result := &PassiveResult{ + Subdomains: sortedKeys(subSet), + URLs: sortedKeys(urlSet), + } + + logPassiveResults(log, sanitizedURL, logdir, result) + + log.Complete(len(result.Subdomains)+len(result.URLs), "discovered") + return result, nil +} + +// fetchCrtsh pulls subdomains from crt.sh's certificate transparency json. 
+func fetchCrtsh(ctx context.Context, client *http.Client, domain string) ([]string, error) { + body, err := passiveGET(ctx, client, fmt.Sprintf(crtshBaseURL, domain)) + if err != nil { + return nil, err + } + + var entries []crtshEntry + if err := json.Unmarshal(body, &entries); err != nil { + return nil, fmt.Errorf("parse crt.sh json: %w", err) + } + + var names []string + for i := 0; i < len(entries); i++ { + // name_value can pack several names separated by newlines. + for _, name := range strings.Split(entries[i].NameValue, "\n") { + if host := normalizeHost(name); host != "" { + names = append(names, host) + } + } + } + return names, nil +} + +// fetchCertspotter pulls subdomains from certspotter's keyless issuances feed. +func fetchCertspotter(ctx context.Context, client *http.Client, domain string) ([]string, error) { + body, err := passiveGET(ctx, client, fmt.Sprintf(certspotterBaseURL, domain)) + if err != nil { + return nil, err + } + + var entries []certspotterEntry + if err := json.Unmarshal(body, &entries); err != nil { + return nil, fmt.Errorf("parse certspotter json: %w", err) + } + + var names []string + for i := 0; i < len(entries); i++ { + for _, name := range entries[i].DNSNames { + if host := normalizeHost(name); host != "" { + names = append(names, host) + } + } + } + return names, nil +} + +// fetchWayback pulls historical urls from the wayback machine cdx index, which +// returns one original url per line. +func fetchWayback(ctx context.Context, client *http.Client, domain string) ([]string, error) { + body, err := passiveGET(ctx, client, fmt.Sprintf(waybackBaseURL, domain)) + if err != nil { + return nil, err + } + + var urls []string + scanner := bufio.NewScanner(strings.NewReader(string(body))) + // historical urls can be long; give the scanner a generous line buffer. 
+ scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line != "" { + urls = append(urls, line) + } + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("read wayback lines: %w", err) + } + return urls, nil +} + +// passiveGET performs a bounded GET against a passive source. non-200 responses +// are treated as a source failure so the caller can skip it. +func passiveGET(ctx context.Context, client *http.Client, reqURL string) ([]byte, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, http.NoBody) + if err != nil { + return nil, fmt.Errorf("create request: %w", err) + } + req.Header.Set("Accept", "application/json") + + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status %d", resp.StatusCode) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, passiveMaxBytes)) + if err != nil { + return nil, fmt.Errorf("read response: %w", err) + } + return body, nil +} + +// normalizeHost lowercases a name and strips a leading wildcard label so +// "*.example.com" and "EXAMPLE.com" collapse to one canonical host. +func normalizeHost(name string) string { + host := strings.ToLower(strings.TrimSpace(name)) + host = strings.TrimPrefix(host, "*.") + return host +} + +// addAll inserts every value into the dedupe set. 
+func addAll(set map[string]struct{}, values []string) { + for _, v := range values { + set[v] = struct{}{} + } +} + +func logPassiveResults(log *output.ModuleLogger, sanitizedURL, logdir string, result *PassiveResult) { + for _, sub := range result.Subdomains { + log.Success("subdomain: %s", output.Highlight.Render(sub)) + } + for _, u := range result.URLs { + log.Info("url: %s", u) + } + + if logdir == "" { + return + } + + var sb strings.Builder + if len(result.Subdomains) > 0 { + sb.WriteString(fmt.Sprintf("Subdomains (%d):\n", len(result.Subdomains))) + for _, sub := range result.Subdomains { + sb.WriteString(" " + sub + "\n") + } + } + if len(result.URLs) > 0 { + sb.WriteString(fmt.Sprintf("\nHistorical URLs (%d):\n", len(result.URLs))) + for _, u := range result.URLs { + sb.WriteString(" " + u + "\n") + } + } + _ = logger.Write(sanitizedURL, logdir, sb.String()) +} diff --git a/internal/scan/passive_test.go b/internal/scan/passive_test.go new file mode 100644 index 0000000..c6fb200 --- /dev/null +++ b/internal/scan/passive_test.go @@ -0,0 +1,163 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// sample feed payloads. crt.sh packs several names per name_value (newline +// separated) and emits wildcards; certspotter returns expanded dns_names. 
+const (
+	crtshFixture = `[
+		{"name_value": "www.example.com\n*.example.com"},
+		{"name_value": "api.example.com"},
+		{"name_value": "WWW.example.com"}
+	]`
+	certspotterFixture = `[
+		{"dns_names": ["mail.example.com", "api.example.com"]},
+		{"dns_names": ["*.example.com"]}
+	]`
+	waybackFixture = "http://example.com/\n" +
+		"http://example.com/login\n" +
+		"http://example.com/login\n" +
+		"\n" +
+		"http://example.com/admin\n"
+)
+
+// fixtureServer serves each passive source on its own path and repoints the
+// package base-url vars at it. the vars are restored on cleanup, and the
+// returned server is closed on cleanup as well.
+func fixtureServer(t *testing.T, crtsh, certspotter, wayback string) *httptest.Server {
+	t.Helper()
+
+	mux := http.NewServeMux()
+	mux.HandleFunc("/crtsh", func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(crtsh))
+	})
+	mux.HandleFunc("/certspotter", func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(certspotter))
+	})
+	mux.HandleFunc("/wayback", func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(wayback))
+	})
+	srv := httptest.NewServer(mux)
+	t.Cleanup(srv.Close)
+
+	origCrtsh, origCertspotter, origWayback := crtshBaseURL, certspotterBaseURL, waybackBaseURL
+	// %s still consumes the domain so the production formatting path is exercised.
+	crtshBaseURL = srv.URL + "/crtsh?q=%s"
+	certspotterBaseURL = srv.URL + "/certspotter?domain=%s"
+	waybackBaseURL = srv.URL + "/wayback?url=%s"
+	t.Cleanup(func() {
+		crtshBaseURL, certspotterBaseURL, waybackBaseURL = origCrtsh, origCertspotter, origWayback
+	})
+
+	return srv
+}
+
+// TestPassive_ParsesAndDedupes feeds all three fixtures through Passive and
+// checks that the subdomain and url lists hold exactly the expected entries.
+func TestPassive_ParsesAndDedupes(t *testing.T) {
+	fixtureServer(t, crtshFixture, certspotterFixture, waybackFixture)
+
+	result, err := Passive("https://example.com", 5*time.Second, "")
+	if err != nil {
+		t.Fatalf("Passive: %v", err)
+	}
+
+	// wildcards stripped, case-folded, and merged across both ct feeds.
+	wantSubs := map[string]bool{
+		"www.example.com":  false,
+		"api.example.com":  false,
+		"mail.example.com": false,
+		"example.com":      false, // from "*.example.com"
+	}
+	for _, s := range result.Subdomains {
+		if _, ok := wantSubs[s]; !ok {
+			t.Errorf("unexpected subdomain %q", s)
+			continue
+		}
+		wantSubs[s] = true
+	}
+	for s, seen := range wantSubs {
+		if !seen {
+			t.Errorf("missing subdomain %q in %v", s, result.Subdomains)
+		}
+	}
+	if len(result.Subdomains) != len(wantSubs) {
+		t.Errorf("expected %d deduped subdomains, got %d: %v", len(wantSubs), len(result.Subdomains), result.Subdomains)
+	}
+
+	// wayback: blank line dropped, duplicate /login collapsed.
+	wantURLs := map[string]bool{
+		"http://example.com/":      false,
+		"http://example.com/login": false,
+		"http://example.com/admin": false,
+	}
+	for _, u := range result.URLs {
+		if _, ok := wantURLs[u]; !ok {
+			t.Errorf("unexpected url %q", u)
+			continue
+		}
+		wantURLs[u] = true
+	}
+	// same check as for subdomains: a duplicated entry could otherwise hide a
+	// missing url while still satisfying the length assertion below.
+	for u, seen := range wantURLs {
+		if !seen {
+			t.Errorf("missing url %q in %v", u, result.URLs)
+		}
+	}
+	if len(result.URLs) != len(wantURLs) {
+		t.Errorf("expected %d deduped urls, got %d: %v", len(wantURLs), len(result.URLs), result.URLs)
+	}
+}
+
+// TestPassive_SourceFailureIsIsolated checks that one unparseable feed does not
+// fail the whole run or suppress results from the remaining feeds.
+func TestPassive_SourceFailureIsIsolated(t *testing.T) {
+	// crt.sh serves garbage that fails to parse; the other feeds must still
+	// produce results.
+	fixtureServer(t, "not json", certspotterFixture, waybackFixture)
+
+	result, err := Passive("https://example.com", 5*time.Second, "")
+	if err != nil {
+		t.Fatalf("Passive should not fail when one source is down: %v", err)
+	}
+
+	if len(result.Subdomains) == 0 {
+		t.Error("expected certspotter subdomains despite crt.sh failure")
+	}
+	if len(result.URLs) == 0 {
+		t.Error("expected wayback urls despite crt.sh failure")
+	}
+	if urlsContain(result.Subdomains, "www.example.com") {
+		t.Error("crt.sh-only subdomain leaked despite parse failure")
+	}
+}
+
+// TestPassive_ResultType pins the type tag reported by PassiveResult.
+func TestPassive_ResultType(t *testing.T) {
+	r := &PassiveResult{}
+	if r.ResultType() != "passive" {
+		t.Errorf("ResultType = %q, want passive", r.ResultType())
+	}
+}
+
+// TestNormalizeHost covers the pass-through, wildcard, mixed-case/padded and
+// empty inputs of normalizeHost.
+func TestNormalizeHost(t *testing.T) {
+	tests := []struct {
+		in   string
+		want string
+	}{
+		{"www.example.com", "www.example.com"},
+		{"*.example.com", "example.com"},
+		{" WWW.Example.COM ", "www.example.com"},
+		{"", ""},
+	}
+	for _, tt := range tests {
+		if got := normalizeHost(tt.in); got != tt.want {
+			t.Errorf("normalizeHost(%q) = %q, want %q", tt.in, got, tt.want)
+		}
+	}
+}
diff --git a/man/sif.1 b/man/sif.1
index 4646086..bb160a7 100644
--- a/man/sif.1
+++ b/man/sif.1
@@ -98,6 +98,15 @@ reflected xss probe.
 .B \-framework
 framework detection with cve lookup.
 .TP
+.B \-crawl
+web crawler; spiders same\-host links, scripts and forms, respecting robots.txt.
+.TP
+.BR \-crawl\-depth " \fIn\fR"
+max crawl recursion depth (default 2).
+.TP
+.B \-passive
+passive subdomain and historical url discovery from third\-party feeds (zero traffic to the target).
+.TP
 .B \-noscan
 skip the base url scan (robots.txt, etc).
.SH OPTIONS diff --git a/sif.go b/sif.go index e1c6b09..bb6b705 100644 --- a/sif.go +++ b/sif.go @@ -421,6 +421,26 @@ func (app *App) Run() error { } } + if app.settings.Crawl { + result, err := scan.Crawl(url, app.settings.CrawlDepth, app.settings.Timeout, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running web crawl: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "Crawl") + } + } + + if app.settings.Passive { + result, err := scan.Passive(url, app.settings.Timeout, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running passive discovery: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "Passive") + } + } + // Load and run modules if app.settings.AllModules || app.settings.Modules != "" || app.settings.ModuleTags != "" { loader, err := modules.NewLoader()