diff --git a/README.md b/README.md index d780762..59359f3 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,9 @@ sif has a modular architecture. modules are defined in yaml and can be extended | `-redirect` | open redirect probe | | `-xss` | reflected xss probe | | `-framework` | framework detection with cve lookup | +| `-crawl` | web crawler (spider same-host links/scripts/forms) | +| `-crawl-depth` | max crawl recursion depth (default 2) | +| `-passive` | passive subdomain/url discovery (zero traffic to target) | ### http options diff --git a/docs/usage.md b/docs/usage.md index c58e16a..ae75048 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -186,6 +186,26 @@ export SHODAN_API_KEY=your-api-key ./sif -u https://example.com -framework ``` +### web crawler + +`-crawl` - spider the target, following same-host links, scripts and forms + +`-crawl-depth` - max recursion depth (default 2). respects robots.txt and stays on the target host. + +```bash +./sif -u https://example.com -crawl -crawl-depth 3 +``` + +### passive discovery + +`-passive` - gather subdomains from certificate transparency (crt.sh, certspotter) and historical urls from the wayback machine + +keyless and zero traffic to the target itself - all lookups hit third-party feeds. 
+ +```bash +./sif -u https://example.com -passive +``` + ### whois lookup `-whois` - perform whois lookups diff --git a/go.mod b/go.mod index 4382257..35bb982 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/charmbracelet/glamour v0.10.0 github.com/charmbracelet/lipgloss v1.1.1-0.20250404203927-76690c660834 github.com/charmbracelet/log v1.0.0 + github.com/gocolly/colly/v2 v2.1.0 github.com/likexian/whois v1.15.7 github.com/projectdiscovery/goflags v0.1.74 github.com/projectdiscovery/nuclei/v3 v3.8.0 @@ -160,7 +161,6 @@ require ( github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect github.com/goccy/go-json v0.10.5 // indirect - github.com/gocolly/colly/v2 v2.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang-jwt/jwt/v4 v4.5.2 // indirect github.com/golang-jwt/jwt/v5 v5.2.2 // indirect diff --git a/internal/config/config.go b/internal/config/config.go index c692925..bdd2f95 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -50,6 +50,9 @@ type Settings struct { Redirect bool XSS bool Framework bool + Crawl bool + CrawlDepth int + Passive bool Modules string // Comma-separated list of module IDs to run ModuleTags string // Run modules matching these tags AllModules bool // Run all loaded modules @@ -65,6 +68,10 @@ type Settings struct { // "negative WaitGroup counter"; clamp the parsed value up to this. const minThreads = 1 +// defaultCrawlDepth bounds how far the spider recurses by default; deep enough +// to find linked pages without crawling an entire site. 
+const defaultCrawlDepth = 2 + const ( Nil goflags.EnumVariable = iota @@ -114,6 +121,9 @@ func Parse() *Settings { flagSet.BoolVar(&settings.Redirect, "redirect", false, "Enable open redirect probe"), flagSet.BoolVar(&settings.XSS, "xss", false, "Enable reflected XSS probe"), flagSet.BoolVar(&settings.Framework, "framework", false, "Enable framework detection"), + flagSet.BoolVar(&settings.Crawl, "crawl", false, "Enable web crawling (spider same-host links/scripts/forms)"), + flagSet.IntVar(&settings.CrawlDepth, "crawl-depth", defaultCrawlDepth, "Max crawl recursion depth"), + flagSet.BoolVar(&settings.Passive, "passive", false, "Enable passive subdomain/url discovery (zero traffic to target)"), ) flagSet.CreateGroup("runtime", "Runtime", diff --git a/internal/scan/crawl.go b/internal/scan/crawl.go new file mode 100644 index 0000000..79a5859 --- /dev/null +++ b/internal/scan/crawl.go @@ -0,0 +1,137 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "fmt" + "net/url" + "sort" + "sync" + "time" + + "github.com/gocolly/colly/v2" + + "github.com/dropalldatabases/sif/internal/httpx" + "github.com/dropalldatabases/sif/internal/logger" + "github.com/dropalldatabases/sif/internal/output" +) + +// CrawlResult holds the deduped set of urls discovered by the spider. +type CrawlResult struct { + URLs []string `json:"urls"` +} + +func (r *CrawlResult) ResultType() string { return "crawl" } + +// compile-time check so a result-type drift fails the build, not a run. +var _ ScanResult = (*CrawlResult)(nil) + +// Crawl spiders the target up to depth, following same-host links/scripts/forms. 
+// all traffic flows through the shared httpx client so proxy/headers/rate-limit
+// apply. robots.txt is enforced explicitly: colly starts every collector with
+// IgnoreRobotsTxt set, so the flag is cleared here to make disallowed paths
+// get refused. a depth below 1 is clamped to 1, since colly reads a MaxDepth
+// of 0 or less as "no limit".
+//
+// returns the sorted, deduped same-host urls that were seen. an error comes
+// back when the log header can't be written, the target url has no usable
+// host, or the seed request fails; failures on later pages are only logged.
+func Crawl(targetURL string, depth int, timeout time.Duration, logdir string) (*CrawlResult, error) {
+	log := output.Module("CRAWL")
+	log.Start()
+
+	sanitizedURL := stripScheme(targetURL)
+
+	if logdir != "" {
+		if err := logger.WriteHeader(sanitizedURL, logdir, "web crawl"); err != nil {
+			log.Error("error creating log file: %v", err)
+			return nil, fmt.Errorf("create crawl log: %w", err)
+		}
+	}
+
+	// the host bounds the crawl; without it colly would wander the whole web.
+	parsed, err := url.Parse(targetURL)
+	if err != nil {
+		return nil, fmt.Errorf("parse target url %q: %w", targetURL, err)
+	}
+	host := parsed.Hostname()
+	if host == "" {
+		return nil, fmt.Errorf("target url %q has no host", targetURL)
+	}
+
+	// colly treats MaxDepth <= 0 as unlimited; never let a zero or negative
+	// flag value turn a bounded crawl into an unbounded one.
+	if depth < 1 {
+		log.Warn("crawl depth %d is below 1, clamping to 1", depth)
+		depth = 1
+	}
+
+	collector := colly.NewCollector(
+		colly.MaxDepth(depth),
+		colly.AllowedDomains(host),
+	)
+	// colly ignores robots.txt unless told otherwise; opt back in so
+	// disallowed paths are refused before they are requested.
+	collector.IgnoreRobotsTxt = false
+	// reuse the shared client so proxy/cookie/-H/rate-limit are honored and the
+	// configured timeout applies to every fetch, robots.txt included.
+	// NOTE(review): SetClient replaces the client colly configured at init,
+	// which carried its redirect check - confirm off-host redirects are
+	// still refused with the shared client in place.
+	collector.SetClient(httpx.Client(timeout))
+
+	// dedupe across the concurrent callbacks colly may fire.
+	var mu sync.Mutex
+	seen := make(map[string]struct{})
+
+	record := func(raw string) {
+		if raw == "" {
+			return
+		}
+		// keep the result set scoped to the target host; off-host assets
+		// (cdns, third-party links) are noise for an in-scope crawl.
+		if u, err := url.Parse(raw); err != nil || u.Hostname() != host {
+			return
+		}
+		mu.Lock()
+		if _, ok := seen[raw]; !ok {
+			seen[raw] = struct{}{}
+			log.Success("found: %s", output.Highlight.Render(raw))
+			if logdir != "" {
+				// best-effort: a failed log write must not drop the url
+				// from the in-memory result.
+				_ = logger.Write(sanitizedURL, logdir, raw+"\n")
+			}
+		}
+		mu.Unlock()
+	}
+
+	// links drive recursion; scripts/forms are recorded but not followed.
+	collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
+		link := e.Request.AbsoluteURL(e.Attr("href"))
+		record(link)
+		// Visit enforces AllowedDomains/MaxDepth/robots.txt itself, so
+		// off-host, too-deep or disallowed links are dropped without us
+		// re-checking; the returned error only says which rule fired.
+		_ = e.Request.Visit(link)
+	})
+	collector.OnHTML("script[src]", func(e *colly.HTMLElement) {
+		record(e.Request.AbsoluteURL(e.Attr("src")))
+	})
+	collector.OnHTML("form[action]", func(e *colly.HTMLElement) {
+		record(e.Request.AbsoluteURL(e.Attr("action")))
+	})
+
+	collector.OnError(func(_ *colly.Response, e error) {
+		// a single bad page shouldn't abort the crawl; note it and move on.
+		log.Warn("crawl error: %v", e)
+	})
+
+	if err := collector.Visit(targetURL); err != nil {
+		log.Error("crawl failed: %v", err)
+		return nil, fmt.Errorf("visit %q: %w", targetURL, err)
+	}
+	collector.Wait()
+
+	result := &CrawlResult{URLs: sortedKeys(seen)}
+
+	log.Complete(len(result.URLs), "urls")
+	return result, nil
+}
+
+// sortedKeys returns the map keys in a stable order so output is deterministic.
+func sortedKeys(set map[string]struct{}) []string { + keys := make([]string, 0, len(set)) + for k := range set { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} diff --git a/internal/scan/crawl_test.go b/internal/scan/crawl_test.go new file mode 100644 index 0000000..c0cc260 --- /dev/null +++ b/internal/scan/crawl_test.go @@ -0,0 +1,158 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// crawlSite serves a small link graph: +// +// / -> links /a and an off-host page; references script.js, form action /submit +// /a -> links /b +// /b -> links /c (only reachable at depth 3) +// /c -> leaf +func crawlSite(t *testing.T) *httptest.Server { + t.Helper() + + mux := http.NewServeMux() + // no robots restrictions; colly fetches this before crawling. + mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" { + http.NotFound(w, r) + return + } + _, _ = w.Write([]byte(` + a + off + +
+ `)) + }) + mux.HandleFunc("/a", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`b`)) + }) + mux.HandleFunc("/b", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`c`)) + }) + mux.HandleFunc("/c", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`leaf`)) + }) + + srv := httptest.NewServer(mux) + t.Cleanup(srv.Close) + return srv +} + +func urlsContain(urls []string, want string) bool { + for i := 0; i < len(urls); i++ { + if urls[i] == want { + return true + } + } + return false +} + +func TestCrawl_FindsLinkedPagesAndAssets(t *testing.T) { + srv := crawlSite(t) + + result, err := Crawl(srv.URL, 3, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + // links, scripts and forms must all be recorded, resolved to absolute urls. + wants := []string{ + srv.URL + "/a", + srv.URL + "/b", + srv.URL + "/c", + srv.URL + "/script.js", + srv.URL + "/submit", + } + for _, w := range wants { + if !urlsContain(result.URLs, w) { + t.Errorf("expected crawl to find %q, got %v", w, result.URLs) + } + } + + // AllowedDomains must keep the off-host link out of the result set. + if urlsContain(result.URLs, "https://off-host.example/x") { + t.Errorf("off-host link should be excluded, got %v", result.URLs) + } +} + +func TestCrawl_RespectsDepth(t *testing.T) { + srv := crawlSite(t) + + // depth 1: only links found on the root page (/a, /script.js, /submit) are + // recorded; /b lives one hop deeper and must not appear. 
+ result, err := Crawl(srv.URL, 1, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + if !urlsContain(result.URLs, srv.URL+"/a") { + t.Errorf("depth 1 should find /a, got %v", result.URLs) + } + if urlsContain(result.URLs, srv.URL+"/b") { + t.Errorf("depth 1 must not reach /b, got %v", result.URLs) + } + if urlsContain(result.URLs, srv.URL+"/c") { + t.Errorf("depth 1 must not reach /c, got %v", result.URLs) + } +} + +func TestCrawl_Dedupes(t *testing.T) { + // a page that links the same target twice must yield a single entry. + mux := http.NewServeMux() + mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/dup" { + _, _ = w.Write([]byte(`leaf`)) + return + } + _, _ = w.Write([]byte(`12`)) + }) + srv := httptest.NewServer(mux) + defer srv.Close() + + result, err := Crawl(srv.URL, 2, 5*time.Second, "") + if err != nil { + t.Fatalf("Crawl: %v", err) + } + + count := 0 + for _, u := range result.URLs { + if u == srv.URL+"/dup" { + count++ + } + } + if count != 1 { + t.Errorf("expected /dup once after dedupe, got %d in %v", count, result.URLs) + } +} + +func TestCrawl_ResultType(t *testing.T) { + r := &CrawlResult{} + if r.ResultType() != "crawl" { + t.Errorf("ResultType = %q, want crawl", r.ResultType()) + } +} diff --git a/internal/scan/passive.go b/internal/scan/passive.go new file mode 100644 index 0000000..8d02ced --- /dev/null +++ b/internal/scan/passive.go @@ -0,0 +1,266 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + 
"io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/dropalldatabases/sif/internal/httpx" + "github.com/dropalldatabases/sif/internal/logger" + "github.com/dropalldatabases/sif/internal/output" +) + +// source base urls are vars so tests can repoint them at local fixtures. they +// carry a trailing %s for the domain (or query) each source expects. +var ( + crtshBaseURL = "https://crt.sh/?q=%%25.%s&output=json" + certspotterBaseURL = "https://api.certspotter.com/v1/issuances?domain=%s&include_subdomains=true&expand=dns_names" + waybackBaseURL = "http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=text&fl=original&collapse=urlkey" +) + +// cap the response we read from any one source so a hostile/huge feed can't +// exhaust memory. +const passiveMaxBytes = 25 * 1024 * 1024 + +// PassiveResult holds passively-gathered subdomains and historical urls. all +// data comes from third-party feeds; the target itself sees zero traffic. +type PassiveResult struct { + Subdomains []string `json:"subdomains"` + URLs []string `json:"urls"` +} + +func (r *PassiveResult) ResultType() string { return "passive" } + +// compile-time check so a result-type drift fails the build, not a run. +var _ ScanResult = (*PassiveResult)(nil) + +// crtshEntry is one certificate record from crt.sh; name_value may itself hold +// several newline-separated names. +type crtshEntry struct { + NameValue string `json:"name_value"` +} + +// certspotterEntry is one issuance from certspotter, expanded to dns names. +type certspotterEntry struct { + DNSNames []string `json:"dns_names"` +} + +// Passive performs keyless passive recon: subdomains from certificate +// transparency feeds plus historical urls from the wayback machine. each source +// fails independently so one feed being down doesn't sink the rest. 
+//
+// an error is returned when the target url has no usable host, the log header
+// can't be written, or every source fails - an empty result is only reported
+// as success when at least one feed actually answered.
+func Passive(targetURL string, timeout time.Duration, logdir string) (*PassiveResult, error) {
+	log := output.Module("PASSIVE")
+	log.Start()
+
+	parsed, err := url.Parse(targetURL)
+	if err != nil {
+		return nil, fmt.Errorf("parse target url %q: %w", targetURL, err)
+	}
+	domain := parsed.Hostname()
+	if domain == "" {
+		return nil, fmt.Errorf("target url %q has no host", targetURL)
+	}
+
+	sanitizedURL := stripScheme(targetURL)
+	if logdir != "" {
+		if err := logger.WriteHeader(sanitizedURL, logdir, "passive recon"); err != nil {
+			log.Error("error creating log file: %v", err)
+			return nil, fmt.Errorf("create passive log: %w", err)
+		}
+	}
+
+	client := httpx.Client(timeout)
+	// no caller context exists to inherit from, so this is the root; each
+	// request is still bounded by the client's own timeout.
+	ctx := context.Background()
+
+	subSet := make(map[string]struct{})
+	urlSet := make(map[string]struct{})
+
+	// track failures so "every feed was down" isn't mistaken for "nothing
+	// to find".
+	const passiveSources = 3
+	failed := 0
+
+	// crt.sh certificate transparency
+	if subs, err := fetchCrtsh(ctx, client, domain); err != nil {
+		log.Warn("crt.sh failed: %v", err)
+		failed++
+	} else {
+		addAll(subSet, subs)
+	}
+
+	// certspotter certificate transparency
+	if subs, err := fetchCertspotter(ctx, client, domain); err != nil {
+		log.Warn("certspotter failed: %v", err)
+		failed++
+	} else {
+		addAll(subSet, subs)
+	}
+
+	// wayback machine historical urls
+	if urls, err := fetchWayback(ctx, client, domain); err != nil {
+		log.Warn("wayback failed: %v", err)
+		failed++
+	} else {
+		addAll(urlSet, urls)
+	}
+
+	if failed == passiveSources {
+		return nil, fmt.Errorf("passive recon for %q: all %d sources failed", domain, passiveSources)
+	}
+
+	result := &PassiveResult{
+		Subdomains: sortedKeys(subSet),
+		URLs:       sortedKeys(urlSet),
+	}
+
+	logPassiveResults(log, sanitizedURL, logdir, result)
+
+	log.Complete(len(result.Subdomains)+len(result.URLs), "discovered")
+	return result, nil
+}
+
+// fetchCrtsh pulls subdomains from crt.sh's certificate transparency json.
+// names outside the target domain are dropped; the domain is query-escaped
+// before it is placed in the request url.
+func fetchCrtsh(ctx context.Context, client *http.Client, domain string) ([]string, error) {
+	body, err := passiveGET(ctx, client, fmt.Sprintf(crtshBaseURL, url.QueryEscape(domain)))
+	if err != nil {
+		return nil, err
+	}
+
+	var entries []crtshEntry
+	if err := json.Unmarshal(body, &entries); err != nil {
+		return nil, fmt.Errorf("parse crt.sh json: %w", err)
+	}
+
+	var names []string
+	for i := range entries {
+		// name_value can pack several names separated by newlines.
+		for _, name := range strings.Split(entries[i].NameValue, "\n") {
+			if host := normalizeHost(name); passiveInScope(host, domain) {
+				names = append(names, host)
+			}
+		}
+	}
+	return names, nil
+}
+
+// fetchCertspotter pulls subdomains from certspotter's keyless issuances feed.
+// dns_names is filtered to the target domain because an issuance can list
+// names that merely share a certificate with it.
+func fetchCertspotter(ctx context.Context, client *http.Client, domain string) ([]string, error) {
+	body, err := passiveGET(ctx, client, fmt.Sprintf(certspotterBaseURL, url.QueryEscape(domain)))
+	if err != nil {
+		return nil, err
+	}
+
+	var entries []certspotterEntry
+	if err := json.Unmarshal(body, &entries); err != nil {
+		return nil, fmt.Errorf("parse certspotter json: %w", err)
+	}
+
+	var names []string
+	for i := range entries {
+		for _, name := range entries[i].DNSNames {
+			if host := normalizeHost(name); passiveInScope(host, domain) {
+				names = append(names, host)
+			}
+		}
+	}
+	return names, nil
+}
+
+// fetchWayback pulls historical urls from the wayback machine cdx index, which
+// returns one original url per line. blank lines are skipped; a line longer
+// than the scanner's 1 MiB limit fails the whole source.
+func fetchWayback(ctx context.Context, client *http.Client, domain string) ([]string, error) {
+	body, err := passiveGET(ctx, client, fmt.Sprintf(waybackBaseURL, url.QueryEscape(domain)))
+	if err != nil {
+		return nil, err
+	}
+
+	var urls []string
+	scanner := bufio.NewScanner(strings.NewReader(string(body)))
+	// historical urls can be long; give the scanner a generous line buffer.
+	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line != "" {
+			urls = append(urls, line)
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, fmt.Errorf("read wayback lines: %w", err)
+	}
+	return urls, nil
+}
+
+// passiveGET performs a bounded GET against a passive source. non-200 responses
+// are treated as a source failure so the caller can skip it, and so is a body
+// larger than passiveMaxBytes: a silently truncated feed would surface as a
+// confusing parse error or, for line-based feeds, a cut-off final record.
+func passiveGET(ctx context.Context, client *http.Client, reqURL string) ([]byte, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, http.NoBody)
+	if err != nil {
+		return nil, fmt.Errorf("create request: %w", err)
+	}
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("unexpected status %d", resp.StatusCode)
+	}
+
+	// read one byte past the cap so an oversized body can be told apart from
+	// one that happens to end exactly at the limit.
+	body, err := io.ReadAll(io.LimitReader(resp.Body, passiveMaxBytes+1))
+	if err != nil {
+		return nil, fmt.Errorf("read response: %w", err)
+	}
+	if len(body) > passiveMaxBytes {
+		return nil, fmt.Errorf("response exceeds %d byte cap", passiveMaxBytes)
+	}
+	return body, nil
+}
+
+// normalizeHost lowercases a name and strips a leading wildcard label so
+// "*.example.com" and "EXAMPLE.com" collapse to one canonical host.
+func normalizeHost(name string) string {
+	host := strings.ToLower(strings.TrimSpace(name))
+	host = strings.TrimPrefix(host, "*.")
+	return host
+}
+
+// passiveInScope reports whether an already-normalized host is the target
+// domain itself or a name beneath it. empty hosts are never in scope.
+func passiveInScope(host, domain string) bool {
+	if host == "" {
+		return false
+	}
+	domain = strings.ToLower(domain)
+	return host == domain || strings.HasSuffix(host, "."+domain)
+}
+
+// addAll inserts every value into the dedupe set.
+func addAll(set map[string]struct{}, values []string) { + for _, v := range values { + set[v] = struct{}{} + } +} + +func logPassiveResults(log *output.ModuleLogger, sanitizedURL, logdir string, result *PassiveResult) { + for _, sub := range result.Subdomains { + log.Success("subdomain: %s", output.Highlight.Render(sub)) + } + for _, u := range result.URLs { + log.Info("url: %s", u) + } + + if logdir == "" { + return + } + + var sb strings.Builder + if len(result.Subdomains) > 0 { + sb.WriteString(fmt.Sprintf("Subdomains (%d):\n", len(result.Subdomains))) + for _, sub := range result.Subdomains { + sb.WriteString(" " + sub + "\n") + } + } + if len(result.URLs) > 0 { + sb.WriteString(fmt.Sprintf("\nHistorical URLs (%d):\n", len(result.URLs))) + for _, u := range result.URLs { + sb.WriteString(" " + u + "\n") + } + } + _ = logger.Write(sanitizedURL, logdir, sb.String()) +} diff --git a/internal/scan/passive_test.go b/internal/scan/passive_test.go new file mode 100644 index 0000000..c6fb200 --- /dev/null +++ b/internal/scan/passive_test.go @@ -0,0 +1,163 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// sample feed payloads. crt.sh packs several names per name_value (newline +// separated) and emits wildcards; certspotter returns expanded dns_names. 
+const ( + crtshFixture = `[ + {"name_value": "www.example.com\n*.example.com"}, + {"name_value": "api.example.com"}, + {"name_value": "WWW.example.com"} + ]` + certspotterFixture = `[ + {"dns_names": ["mail.example.com", "api.example.com"]}, + {"dns_names": ["*.example.com"]} + ]` + waybackFixture = "http://example.com/\n" + + "http://example.com/login\n" + + "http://example.com/login\n" + + "\n" + + "http://example.com/admin\n" +) + +// fixtureServer serves each passive source on its own path and repoints the +// package base-url vars at it. the vars are restored on cleanup. +func fixtureServer(t *testing.T, crtsh, certspotter, wayback string) *httptest.Server { + t.Helper() + + mux := http.NewServeMux() + mux.HandleFunc("/crtsh", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(crtsh)) + }) + mux.HandleFunc("/certspotter", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(certspotter)) + }) + mux.HandleFunc("/wayback", func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(wayback)) + }) + srv := httptest.NewServer(mux) + t.Cleanup(srv.Close) + + origCrtsh, origCertspotter, origWayback := crtshBaseURL, certspotterBaseURL, waybackBaseURL + // %s still consumes the domain so the production formatting path is exercised. + crtshBaseURL = srv.URL + "/crtsh?q=%s" + certspotterBaseURL = srv.URL + "/certspotter?domain=%s" + waybackBaseURL = srv.URL + "/wayback?url=%s" + t.Cleanup(func() { + crtshBaseURL, certspotterBaseURL, waybackBaseURL = origCrtsh, origCertspotter, origWayback + }) + + return srv +} + +func TestPassive_ParsesAndDedupes(t *testing.T) { + fixtureServer(t, crtshFixture, certspotterFixture, waybackFixture) + + result, err := Passive("https://example.com", 5*time.Second, "") + if err != nil { + t.Fatalf("Passive: %v", err) + } + + // wildcards stripped, case-folded, and merged across both ct feeds. 
+ wantSubs := map[string]bool{ + "www.example.com": false, + "api.example.com": false, + "mail.example.com": false, + "example.com": false, // from "*.example.com" + } + for _, s := range result.Subdomains { + if _, ok := wantSubs[s]; !ok { + t.Errorf("unexpected subdomain %q", s) + continue + } + wantSubs[s] = true + } + for s, seen := range wantSubs { + if !seen { + t.Errorf("missing subdomain %q in %v", s, result.Subdomains) + } + } + if len(result.Subdomains) != len(wantSubs) { + t.Errorf("expected %d deduped subdomains, got %d: %v", len(wantSubs), len(result.Subdomains), result.Subdomains) + } + + // wayback: blank line dropped, duplicate /login collapsed. + wantURLs := map[string]bool{ + "http://example.com/": false, + "http://example.com/login": false, + "http://example.com/admin": false, + } + for _, u := range result.URLs { + if _, ok := wantURLs[u]; !ok { + t.Errorf("unexpected url %q", u) + continue + } + wantURLs[u] = true + } + if len(result.URLs) != len(wantURLs) { + t.Errorf("expected %d deduped urls, got %d: %v", len(wantURLs), len(result.URLs), result.URLs) + } +} + +func TestPassive_SourceFailureIsIsolated(t *testing.T) { + // crt.sh serves garbage that fails to parse; the other feeds must still + // produce results. 
+ fixtureServer(t, "not json", certspotterFixture, waybackFixture) + + result, err := Passive("https://example.com", 5*time.Second, "") + if err != nil { + t.Fatalf("Passive should not fail when one source is down: %v", err) + } + + if len(result.Subdomains) == 0 { + t.Error("expected certspotter subdomains despite crt.sh failure") + } + if len(result.URLs) == 0 { + t.Error("expected wayback urls despite crt.sh failure") + } + if urlsContain(result.Subdomains, "www.example.com") { + t.Error("crt.sh-only subdomain leaked despite parse failure") + } +} + +func TestPassive_ResultType(t *testing.T) { + r := &PassiveResult{} + if r.ResultType() != "passive" { + t.Errorf("ResultType = %q, want passive", r.ResultType()) + } +} + +func TestNormalizeHost(t *testing.T) { + tests := []struct { + in string + want string + }{ + {"www.example.com", "www.example.com"}, + {"*.example.com", "example.com"}, + {" WWW.Example.COM ", "www.example.com"}, + {"", ""}, + } + for _, tt := range tests { + if got := normalizeHost(tt.in); got != tt.want { + t.Errorf("normalizeHost(%q) = %q, want %q", tt.in, got, tt.want) + } + } +} diff --git a/man/sif.1 b/man/sif.1 index 4646086..bb160a7 100644 --- a/man/sif.1 +++ b/man/sif.1 @@ -98,6 +98,15 @@ reflected xss probe. .B \-framework framework detection with cve lookup. .TP +.B \-crawl +web crawler; spiders same\-host links, scripts and forms, respecting robots.txt. +.TP +.BR \-crawl\-depth " \fIn\fR" +max crawl recursion depth (default 2). +.TP +.B \-passive +passive subdomain and historical url discovery from third\-party feeds (zero traffic to the target). +.TP .B \-noscan skip the base url scan (robots.txt, etc). 
.SH OPTIONS diff --git a/sif.go b/sif.go index e1c6b09..bb6b705 100644 --- a/sif.go +++ b/sif.go @@ -421,6 +421,26 @@ func (app *App) Run() error { } } + if app.settings.Crawl { + result, err := scan.Crawl(url, app.settings.CrawlDepth, app.settings.Timeout, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running web crawl: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "Crawl") + } + } + + if app.settings.Passive { + result, err := scan.Passive(url, app.settings.Timeout, app.settings.LogDir) + if err != nil { + log.Errorf("Error while running passive discovery: %s", err) + } else if result != nil { + moduleResults = append(moduleResults, NewModuleResult(result)) + scansRun = append(scansRun, "Passive") + } + } + // Load and run modules if app.settings.AllModules || app.settings.Modules != "" || app.settings.ModuleTags != "" { loader, err := modules.NewLoader()