From dbe79c495ee8bc7d7012a03a5fda9bb4648522b3 Mon Sep 17 00:00:00 2001 From: vmfunc Date: Tue, 9 Jun 2026 17:57:42 -0700 Subject: [PATCH] feat(scan): add web crawler and passive subdomain/url discovery -crawl spiders same-host links/scripts/forms through the shared httpx client so proxy/headers/rate-limit and robots.txt are honored, bounded by -crawl-depth. -passive pulls subdomains from keyless ct feeds (crt.sh, certspotter) and historical urls from wayback, each source isolated so one feed being down doesn't sink the rest and the target sees no traffic. --- README.md | 3 + docs/usage.md | 20 +++ go.mod | 2 +- internal/config/config.go | 10 ++ internal/scan/crawl.go | 137 +++++++++++++++++ internal/scan/crawl_test.go | 158 ++++++++++++++++++++ internal/scan/passive.go | 266 ++++++++++++++++++++++++++++++++++ internal/scan/passive_test.go | 163 +++++++++++++++++++++ man/sif.1 | 9 ++ sif.go | 20 +++ 10 files changed, 787 insertions(+), 1 deletion(-) create mode 100644 internal/scan/crawl.go create mode 100644 internal/scan/crawl_test.go create mode 100644 internal/scan/passive.go create mode 100644 internal/scan/passive_test.go diff --git a/README.md b/README.md index d780762..59359f3 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,9 @@ sif has a modular architecture. modules are defined in yaml and can be extended | `-redirect` | open redirect probe | | `-xss` | reflected xss probe | | `-framework` | framework detection with cve lookup | +| `-crawl` | web crawler (spider same-host links/scripts/forms) | +| `-crawl-depth` | max crawl recursion depth (default 2) | +| `-passive` | passive subdomain/url discovery (zero traffic to target) | ### http options diff --git a/docs/usage.md b/docs/usage.md index c58e16a..ae75048 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -186,6 +186,26 @@ export SHODAN_API_KEY=your-api-key ./sif -u https://example.com -framework ``` +### web crawler + +`-crawl` - spider the target, following same-host links, scripts and forms + +`-crawl-depth` - max recursion depth (default 2). respects robots.txt and stays on the target host. + +```bash +./sif -u https://example.com -crawl -crawl-depth 3 +``` + +### passive discovery + +`-passive` - gather subdomains from certificate transparency (crt.sh, certspotter) and historical urls from the wayback machine + +keyless and zero traffic to the target itself - all lookups hit third-party feeds. + +```bash +./sif -u https://example.com -passive +``` + ### whois lookup `-whois` - perform whois lookups diff --git a/go.mod b/go.mod index 4382257..35bb982 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/charmbracelet/glamour v0.10.0 github.com/charmbracelet/lipgloss v1.1.1-0.20250404203927-76690c660834 github.com/charmbracelet/log v1.0.0 + github.com/gocolly/colly/v2 v2.1.0 github.com/likexian/whois v1.15.7 github.com/projectdiscovery/goflags v0.1.74 github.com/projectdiscovery/nuclei/v3 v3.8.0 @@ -160,7 +161,6 @@ require ( github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect github.com/goccy/go-json v0.10.5 // indirect - github.com/gocolly/colly/v2 v2.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang-jwt/jwt/v4 v4.5.2 // indirect github.com/golang-jwt/jwt/v5 v5.2.2 // indirect diff --git a/internal/config/config.go b/internal/config/config.go index c692925..bdd2f95 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -50,6 +50,9 @@ type Settings struct { Redirect bool XSS bool Framework bool + Crawl bool + CrawlDepth int + Passive bool Modules string // Comma-separated list of module IDs to run ModuleTags string // Run modules matching these tags AllModules bool // Run all loaded modules @@ -65,6 +68,10 @@ type Settings struct { // "negative WaitGroup counter"; clamp the parsed value up to this. const minThreads = 1 +// defaultCrawlDepth bounds how far the spider recurses by default; deep enough +// to find linked pages without crawling an entire site. +const defaultCrawlDepth = 2 + const ( Nil goflags.EnumVariable = iota @@ -114,6 +121,9 @@ func Parse() *Settings { flagSet.BoolVar(&settings.Redirect, "redirect", false, "Enable open redirect probe"), flagSet.BoolVar(&settings.XSS, "xss", false, "Enable reflected XSS probe"), flagSet.BoolVar(&settings.Framework, "framework", false, "Enable framework detection"), + flagSet.BoolVar(&settings.Crawl, "crawl", false, "Enable web crawling (spider same-host links/scripts/forms)"), + flagSet.IntVar(&settings.CrawlDepth, "crawl-depth", defaultCrawlDepth, "Max crawl recursion depth"), + flagSet.BoolVar(&settings.Passive, "passive", false, "Enable passive subdomain/url discovery (zero traffic to target)"), ) flagSet.CreateGroup("runtime", "Runtime", diff --git a/internal/scan/crawl.go b/internal/scan/crawl.go new file mode 100644 index 0000000..79a5859 --- /dev/null +++ b/internal/scan/crawl.go @@ -0,0 +1,137 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "fmt" + "net/url" + "sort" + "sync" + "time" + + "github.com/gocolly/colly/v2" + + "github.com/dropalldatabases/sif/internal/httpx" + "github.com/dropalldatabases/sif/internal/logger" + "github.com/dropalldatabases/sif/internal/output" +) + +// CrawlResult holds the deduped set of urls discovered by the spider. +type CrawlResult struct { + URLs []string `json:"urls"` +} + +func (r *CrawlResult) ResultType() string { return "crawl" } + +// compile-time check so a result-type drift fails the build, not a run. +var _ ScanResult = (*CrawlResult)(nil) + +// Crawl spiders the target up to depth, following same-host links/scripts/forms. +// all traffic flows through the shared httpx client so proxy/headers/rate-limit +// apply, and robots.txt is respected (colly honors it by default). +func Crawl(targetURL string, depth int, timeout time.Duration, logdir string) (*CrawlResult, error) { + log := output.Module("CRAWL") + log.Start() + + sanitizedURL := stripScheme(targetURL) + + if logdir != "" { + if err := logger.WriteHeader(sanitizedURL, logdir, "web crawl"); err != nil { + log.Error("error creating log file: %v", err) + return nil, fmt.Errorf("create crawl log: %w", err) + } + } + + // the host bounds the crawl; without it colly would wander the whole web. + parsed, err := url.Parse(targetURL) + if err != nil { + return nil, fmt.Errorf("parse target url %q: %w", targetURL, err) + } + host := parsed.Hostname() + if host == "" { + return nil, fmt.Errorf("target url %q has no host", targetURL) + } + + collector := colly.NewCollector( + colly.MaxDepth(depth), + colly.AllowedDomains(host), + ) + // reuse the shared client so proxy/cookie/-H/rate-limit are honored and the + // configured timeout applies to every fetch, robots.txt included. + collector.SetClient(httpx.Client(timeout)) + + // dedupe across the concurrent callbacks colly may fire. + var mu sync.Mutex + seen := make(map[string]struct{}) + + record := func(raw string) { + if raw == "" { + return + } + // keep the result set scoped to the target host; off-host assets + // (cdns, third-party links) are noise for an in-scope crawl. + if u, err := url.Parse(raw); err != nil || u.Hostname() != host { + return + } + mu.Lock() + if _, ok := seen[raw]; !ok { + seen[raw] = struct{}{} + log.Success("found: %s", output.Highlight.Render(raw)) + if logdir != "" { + _ = logger.Write(sanitizedURL, logdir, raw+"\n") + } + } + mu.Unlock() + } + + // links drive recursion; scripts/forms are recorded but not followed. + collector.OnHTML("a[href]", func(e *colly.HTMLElement) { + link := e.Request.AbsoluteURL(e.Attr("href")) + record(link) + // Visit enforces AllowedDomains/MaxDepth itself, so off-host or + // too-deep links are dropped without us re-checking. + _ = e.Request.Visit(link) + }) + collector.OnHTML("script[src]", func(e *colly.HTMLElement) { + record(e.Request.AbsoluteURL(e.Attr("src"))) + }) + collector.OnHTML("form[action]", func(e *colly.HTMLElement) { + record(e.Request.AbsoluteURL(e.Attr("action"))) + }) + + collector.OnError(func(_ *colly.Response, e error) { + // a single bad page shouldn't abort the crawl; note it and move on. + log.Warn("crawl error: %v", e) + }) + + if err := collector.Visit(targetURL); err != nil { + log.Error("crawl failed: %v", err) + return nil, fmt.Errorf("visit %q: %w", targetURL, err) + } + collector.Wait() + + result := &CrawlResult{URLs: sortedKeys(seen)} + + log.Complete(len(result.URLs), "urls") + return result, nil +} + +// sortedKeys returns the map keys in a stable order so output is deterministic. +func sortedKeys(set map[string]struct{}) []string { + keys := make([]string, 0, len(set)) + for k := range set { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} diff --git a/internal/scan/crawl_test.go b/internal/scan/crawl_test.go new file mode 100644 index 0000000..c0cc260 --- /dev/null +++ b/internal/scan/crawl_test.go @@ -0,0 +1,158 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// crawlSite serves a small link graph: +// +// / -> links /a and an off-host page; references script.js, form action /submit +// /a -> links /b +// /b -> links /c (only reachable at depth 3) +// /c -> leaf +func crawlSite(t *testing.T) *httptest.Server { + t.Helper() + + mux := http.NewServeMux() + // no robots restrictions; colly fetches this before crawling. + mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" { + http.NotFound(w, r) + return + } + _, _ = w.Write([]byte(` + a + off + +
+ `)) + }) + mux.HandleFunc("/a", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`b`)) + }) + mux.HandleFunc("/b", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`c`)) + }) + mux.HandleFunc("/c", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`leaf`)) + }) + + srv := httptest.NewServer(mux) + t.Cleanup(srv.Close) + return srv +} + +func urlsContain(urls []string, want string) bool { + for i := 0; i < len(urls); i++ { + if urls[i] == want { + return true + } + } + return false +} + +func TestCrawl_FindsLinkedPagesAndAssets(t *testing.T) { + srv := crawlSite(t) + + result, err := Crawl(srv.URL, 3, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + // links, scripts and forms must all be recorded, resolved to absolute urls. + wants := []string{ + srv.URL + "/a", + srv.URL + "/b", + srv.URL + "/c", + srv.URL + "/script.js", + srv.URL + "/submit", + } + for _, w := range wants { + if !urlsContain(result.URLs, w) { + t.Errorf("expected crawl to find %q, got %v", w, result.URLs) + } + } + + // AllowedDomains must keep the off-host link out of the result set. + if urlsContain(result.URLs, "https://off-host.example/x") { + t.Errorf("off-host link should be excluded, got %v", result.URLs) + } +} + +func TestCrawl_RespectsDepth(t *testing.T) { + srv := crawlSite(t) + + // depth 1: only links found on the root page (/a, /script.js, /submit) are + // recorded; /b lives one hop deeper and must not appear. + result, err := Crawl(srv.URL, 1, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + if !urlsContain(result.URLs, srv.URL+"/a") { + t.Errorf("depth 1 should find /a, got %v", result.URLs) + } + if urlsContain(result.URLs, srv.URL+"/b") { + t.Errorf("depth 1 must not reach /b, got %v", result.URLs) + } + if urlsContain(result.URLs, srv.URL+"/c") { + t.Errorf("depth 1 must not reach /c, got %v", result.URLs) + } +} + +func TestCrawl_Dedupes(t *testing.T) { + // a page that links the same target twice must yield a single entry. + mux := http.NewServeMux() + mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/dup" { + _, _ = w.Write([]byte(`leaf`)) + return + } + _, _ = w.Write([]byte(`12`)) + }) + srv := httptest.NewServer(mux) + defer srv.Close() + + result, err := Crawl(srv.URL, 2, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + count := 0 + for _, u := range result.URLs { + if u == srv.URL+"/dup" { + count++ + } + } + if count != 1 { + t.Errorf("expected /dup once after dedupe, got %d in %v", count, result.URLs) + } +} + +func TestCrawl_ResultType(t *testing.T) { + r := &CrawlResult{} + if r.ResultType() != "crawl" { + t.Errorf("ResultType = %q, want crawl", r.ResultType()) + } +} diff --git a/internal/scan/passive.go b/internal/scan/passive.go new file mode 100644 index 0000000..8d02ced --- /dev/null +++ b/internal/scan/passive.go @@ -0,0 +1,266 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/dropalldatabases/sif/internal/httpx" + "github.com/dropalldatabases/sif/internal/logger" + "github.com/dropalldatabases/sif/internal/output" +) + +// source base urls are vars so tests can repoint them at local fixtures. they +// carry a trailing %s for the domain (or query) each source expects. +var ( + crtshBaseURL = "https://crt.sh/?q=%%25.%s&output=json" + certspotterBaseURL = "https://api.certspotter.com/v1/issuances?domain=%s&include_subdomains=true&expand=dns_names" + waybackBaseURL = "http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=text&fl=original&collapse=urlkey" +) + +// cap the response we read from any one source so a hostile/huge feed can't +// exhaust memory. +const passiveMaxBytes = 25 * 1024 * 1024 + +// PassiveResult holds passively-gathered subdomains and historical urls. all +// data comes from third-party feeds; the target itself sees zero traffic. +type PassiveResult struct { + Subdomains []string `json:"subdomains"` + URLs []string `json:"urls"` +} + +func (r *PassiveResult) ResultType() string { return "passive" } + +// compile-time check so a result-type drift fails the build, not a run. +var _ ScanResult = (*PassiveResult)(nil) + +// crtshEntry is one certificate record from crt.sh; name_value may itself hold +// several newline-separated names. +type crtshEntry struct { + NameValue string `json:"name_value"` +} + +// certspotterEntry is one issuance from certspotter, expanded to dns names. +type certspotterEntry struct { + DNSNames []string `json:"dns_names"` +} + +// Passive performs keyless passive recon: subdomains from certificate +// transparency feeds plus historical urls from the wayback machine. each source +// fails independently so one feed being down doesn't sink the rest. +func Passive(targetURL string, timeout time.Duration, logdir string) (*PassiveResult, error) { + log := output.Module("PASSIVE") + log.Start() + + parsed, err := url.Parse(targetURL) + if err != nil { + return nil, fmt.Errorf("parse target url %q: %w", targetURL, err) + } + domain := parsed.Hostname() + if domain == "" { + return nil, fmt.Errorf("target url %q has no host", targetURL) + } + + sanitizedURL := stripScheme(targetURL) + if logdir != "" { + if err := logger.WriteHeader(sanitizedURL, logdir, "passive recon"); err != nil { + log.Error("error creating log file: %v", err) + return nil, fmt.Errorf("create passive log: %w", err) + } + } + + client := httpx.Client(timeout) + ctx := context.TODO() + + subSet := make(map[string]struct{}) + urlSet := make(map[string]struct{}) + + // crt.sh certificate transparency + if subs, err := fetchCrtsh(ctx, client, domain); err != nil { + log.Warn("crt.sh failed: %v", err) + } else { + addAll(subSet, subs) + } + + // certspotter certificate transparency + if subs, err := fetchCertspotter(ctx, client, domain); err != nil { + log.Warn("certspotter failed: %v", err) + } else { + addAll(subSet, subs) + } + + // wayback machine historical urls + if urls, err := fetchWayback(ctx, client, domain); err != nil { + log.Warn("wayback failed: %v", err) + } else { + addAll(urlSet, urls) + } + + result := &PassiveResult{ + Subdomains: sortedKeys(subSet), + URLs: sortedKeys(urlSet), + } + + logPassiveResults(log, sanitizedURL, logdir, result) + + log.Complete(len(result.Subdomains)+len(result.URLs), "discovered") + return result, nil +} + +// fetchCrtsh pulls subdomains from crt.sh's certificate transparency json. +func fetchCrtsh(ctx context.Context, client *http.Client, domain string) ([]string, error) { + body, err := passiveGET(ctx, client, fmt.Sprintf(crtshBaseURL, domain)) + if err != nil { + return nil, err + } + + var entries []crtshEntry + if err := json.Unmarshal(body, &entries); err != nil { + return nil, fmt.Errorf("parse crt.sh json: %w", err) + } + + var names []string + for i := 0; i < len(entries); i++ { + // name_value can pack several names separated by newlines. + for _, name := range strings.Split(entries[i].NameValue, "\n") { + if host := normalizeHost(name); host != "" { + names = append(names, host) + } + } + } + return names, nil +} + +// fetchCertspotter pulls subdomains from certspotter's keyless issuances feed. +func fetchCertspotter(ctx context.Context, client *http.Client, domain string) ([]string, error) { + body, err := passiveGET(ctx, client, fmt.Sprintf(certspotterBaseURL, domain)) + if err != nil { + return nil, err + } + + var entries []certspotterEntry + if err := json.Unmarshal(body, &entries); err != nil { + return nil, fmt.Errorf("parse certspotter json: %w", err) + } + + var names []string + for i := 0; i < len(entries); i++ { + for _, name := range entries[i].DNSNames { + if host := normalizeHost(name); host != "" { + names = append(names, host) + } + } + } + return names, nil +} + +// fetchWayback pulls historical urls from the wayback machine cdx index, which +// returns one original url per line. +func fetchWayback(ctx context.Context, client *http.Client, domain string) ([]string, error) { + body, err := passiveGET(ctx, client, fmt.Sprintf(waybackBaseURL, domain)) + if err != nil { + return nil, err + } + + var urls []string + scanner := bufio.NewScanner(strings.NewReader(string(body))) + // historical urls can be long; give the scanner a generous line buffer. + scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line != "" { + urls = append(urls, line) + } + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("read wayback lines: %w", err) + } + return urls, nil +} + +// passiveGET performs a bounded GET against a passive source. non-200 responses +// are treated as a source failure so the caller can skip it. +func passiveGET(ctx context.Context, client *http.Client, reqURL string) ([]byte, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, http.NoBody) + if err != nil { + return nil, fmt.Errorf("create request: %w", err) + } + req.Header.Set("Accept", "application/json") + + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status %d", resp.StatusCode) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, passiveMaxBytes)) + if err != nil { + return nil, fmt.Errorf("read response: %w", err) + } + return body, nil +} + +// normalizeHost lowercases a name and strips a leading wildcard label so +// "*.example.com" and "EXAMPLE.com" collapse to one canonical host. +func normalizeHost(name string) string { + host := strings.ToLower(strings.TrimSpace(name)) + host = strings.TrimPrefix(host, "*.") + return host +} + +// addAll inserts every value into the dedupe set. +func addAll(set map[string]struct{}, values []string) { + for _, v := range values { + set[v] = struct{}{} + } +} + +func logPassiveResults(log *output.ModuleLogger, sanitizedURL, logdir string, result *PassiveResult) { + for _, sub := range result.Subdomains { + log.Success("subdomain: %s", output.Highlight.Render(sub)) + } + for _, u := range result.URLs { + log.Info("url: %s", u) + } + + if logdir == "" { + return + } + + var sb strings.Builder + if len(result.Subdomains) > 0 { + sb.WriteString(fmt.Sprintf("Subdomains (%d):\n", len(result.Subdomains))) + for _, sub := range result.Subdomains { + sb.WriteString(" " + sub + "\n") + } + } + if len(result.URLs) > 0 { + sb.WriteString(fmt.Sprintf("\nHistorical URLs (%d):\n", len(result.URLs))) + for _, u := range result.URLs { + sb.WriteString(" " + u + "\n") + } + } + _ = logger.Write(sanitizedURL, logdir, sb.String()) +} diff --git a/internal/scan/passive_test.go b/internal/scan/passive_test.go new file mode 100644 index 0000000..c6fb200 --- /dev/null +++ b/internal/scan/passive_test.go @@ -0,0 +1,163 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// sample feed payloads. crt.sh packs several names per name_value (newline +// separated) and emits wildcards; certspotter returns expanded dns_names. +const ( + crtshFixture = `[ + {"name_value": "www.example.com\n*.example.com"}, + {"name_value": "api.example.com"}, + {"name_value": "WWW.example.com"} + ]` + certspotterFixture = `[ + {"dns_names": ["mail.example.com", "api.example.com"]}, + {"dns_names": ["*.example.com"]} + ]` + waybackFixture = "http://example.com/\n" + + "http://example.com/login\n" + + "http://example.com/login\n" + + "\n" + + "http://example.com/admin\n" +) + +// fixtureServer serves each passive source on its own path and repoints the +// package base-url vars at it. the vars are restored on cleanup. +func fixtureServer(t *testing.T, crtsh, certspotter, wayback string) *httptest.Server { + t.Helper() + + mux := http.NewServeMux() + mux.HandleFunc("/crtsh", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(crtsh)) + }) + mux.HandleFunc("/certspotter", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(certspotter)) + }) + mux.HandleFunc("/wayback", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(wayback)) + }) + srv := httptest.NewServer(mux) + t.Cleanup(srv.Close) + + origCrtsh, origCertspotter, origWayback := crtshBaseURL, certspotterBaseURL, waybackBaseURL + // %s still consumes the domain so the production formatting path is exercised. + crtshBaseURL = srv.URL + "/crtsh?q=%s" + certspotterBaseURL = srv.URL + "/certspotter?domain=%s" + waybackBaseURL = srv.URL + "/wayback?url=%s" + t.Cleanup(func() { + crtshBaseURL, certspotterBaseURL, waybackBaseURL = origCrtsh, origCertspotter, origWayback + }) + + return srv +} + +func TestPassive_ParsesAndDedupes(t *testing.T) { + fixtureServer(t, crtshFixture, certspotterFixture, waybackFixture) + + result, err := Passive("https://example.com", 5*time.Second, "") + if err != nil { + t.Fatalf("Passive: %v", err) + } + + // wildcards stripped, case-folded, and merged across both ct feeds. + wantSubs := map[string]bool{ + "www.example.com": false, + "api.example.com": false, + "mail.example.com": false, + "example.com": false, // from "*.example.com" + } + for _, s := range result.Subdomains { + if _, ok := wantSubs[s]; !ok { + t.Errorf("unexpected subdomain %q", s) + continue + } + wantSubs[s] = true + } + for s, seen := range wantSubs { + if !seen { + t.Errorf("missing subdomain %q in %v", s, result.Subdomains) + } + } + if len(result.Subdomains) != len(wantSubs) { + t.Errorf("expected %d deduped subdomains, got %d: %v", len(wantSubs), len(result.Subdomains), result.Subdomains) + } + + // wayback: blank line dropped, duplicate /login collapsed. + wantURLs := map[string]bool{ + "http://example.com/": false, + "http://example.com/login": false, + "http://example.com/admin": false, + } + for _, u := range result.URLs { + if _, ok := wantURLs[u]; !ok { + t.Errorf("unexpected url %q", u) + continue + } + wantURLs[u] = true + } + if len(result.URLs) != len(wantURLs) { + t.Errorf("expected %d deduped urls, got %d: %v", len(wantURLs), len(result.URLs), result.URLs) + } +} + +func TestPassive_SourceFailureIsIsolated(t *testing.T) { + // crt.sh serves garbage that fails to parse; the other feeds must still + // produce results. + fixtureServer(t, "not json", certspotterFixture, waybackFixture) + + result, err := Passive("https://example.com", 5*time.Second, "") + if err != nil { + t.Fatalf("Passive should not fail when one source is down: %v", err) + } + + if len(result.Subdomains) == 0 { + t.Error("expected certspotter subdomains despite crt.sh failure") + } + if len(result.URLs) == 0 { + t.Error("expected wayback urls despite crt.sh failure") + } + if urlsContain(result.Subdomains, "www.example.com") { + t.Error("crt.sh-only subdomain leaked despite parse failure") + } +} + +func TestPassive_ResultType(t *testing.T) { + r := &PassiveResult{} + if r.ResultType() != "passive" { + t.Errorf("ResultType = %q, want passive", r.ResultType()) + } +} + +func TestNormalizeHost(t *testing.T) { + tests := []struct { + in string + want string + }{ + {"www.example.com", "www.example.com"}, + {"*.example.com", "example.com"}, + {" WWW.Example.COM ", "www.example.com"}, + {"", ""}, + } + for _, tt := range tests { + if got := normalizeHost(tt.in); got != tt.want { + t.Errorf("normalizeHost(%q) = %q, want %q", tt.in, got, tt.want) + } + } +} diff --git a/man/sif.1 b/man/sif.1 index 4646086..bb160a7 100644 --- a/man/sif.1 +++ b/man/sif.1 @@ -98,6 +98,15 @@ reflected xss probe. .B \-framework framework detection with cve lookup. .TP +.B \-crawl +web crawler; spiders same\-host links, scripts and forms, respecting robots.txt. +.TP +.BR \-crawl\-depth " \fIn\fR" +max crawl recursion depth (default 2). +.TP +.B \-passive +passive subdomain and historical url discovery from third\-party feeds (zero traffic to the target). +.TP .B \-noscan skip the base url scan (robots.txt, etc). .SH OPTIONS diff --git a/sif.go b/sif.go index e1c6b09..bb6b705 100644 --- a/sif.go +++ b/sif.go @@ -421,6 +421,26 @@ func (app *App) Run() error { } } + if app.settings.Crawl { + result, err := scan.Crawl(url, app.settings.CrawlDepth, app.settings.Timeout, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running web crawl: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "Crawl") + } + } + + if app.settings.Passive { + result, err := scan.Passive(url, app.settings.Timeout, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running passive discovery: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "Passive") + } + } + // Load and run modules if app.settings.AllModules || app.settings.Modules != "" || app.settings.ModuleTags != "" { loader, err := modules.NewLoader()