feat(scan): add web crawler and passive subdomain/url discovery

-crawl spiders same-host links/scripts/forms through the shared httpx
client so proxy/headers/rate-limit and robots.txt are honored, bounded
by -crawl-depth. -passive pulls subdomains from keyless ct feeds (crt.sh,
certspotter) and historical urls from wayback, each source isolated so
one feed being down doesn't sink the rest and the target sees no traffic.
This commit is contained in:
vmfunc
2026-06-09 17:57:42 -07:00
committed by vmfunc
parent 9401aa669e
commit dbe79c495e
10 changed files with 787 additions and 1 deletions
+3
View File
@@ -177,6 +177,9 @@ sif has a modular architecture. modules are defined in yaml and can be extended
| `-redirect` | open redirect probe |
| `-xss` | reflected xss probe |
| `-framework` | framework detection with cve lookup |
| `-crawl` | web crawler (spider same-host links/scripts/forms) |
| `-crawl-depth` | max crawl recursion depth (default 2) |
| `-passive` | passive subdomain/url discovery (zero traffic to target) |
### http options
+20
View File
@@ -186,6 +186,26 @@ export SHODAN_API_KEY=your-api-key
./sif -u https://example.com -framework
```
### web crawler
`-crawl` - spider the target, following same-host links, scripts and forms
`-crawl-depth` - max recursion depth (default 2). the crawler respects robots.txt and stays on the target host.
```bash
./sif -u https://example.com -crawl -crawl-depth 3
```
### passive discovery
`-passive` - gather subdomains from certificate transparency (crt.sh, certspotter) and historical urls from the wayback machine
keyless and zero traffic to the target itself - all lookups hit third-party feeds.
```bash
./sif -u https://example.com -passive
```
### whois lookup
`-whois` - perform whois lookups
+1 -1
View File
@@ -7,6 +7,7 @@ require (
github.com/charmbracelet/glamour v0.10.0
github.com/charmbracelet/lipgloss v1.1.1-0.20250404203927-76690c660834
github.com/charmbracelet/log v1.0.0
github.com/gocolly/colly/v2 v2.1.0
github.com/likexian/whois v1.15.7
github.com/projectdiscovery/goflags v0.1.74
github.com/projectdiscovery/nuclei/v3 v3.8.0
@@ -160,7 +161,6 @@ require (
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/goccy/go-json v0.10.5 // indirect
github.com/gocolly/colly/v2 v2.1.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang-jwt/jwt/v4 v4.5.2 // indirect
github.com/golang-jwt/jwt/v5 v5.2.2 // indirect
+10
View File
@@ -50,6 +50,9 @@ type Settings struct {
Redirect bool
XSS bool
Framework bool
Crawl bool
CrawlDepth int
Passive bool
Modules string // Comma-separated list of module IDs to run
ModuleTags string // Run modules matching these tags
AllModules bool // Run all loaded modules
@@ -65,6 +68,10 @@ type Settings struct {
// "negative WaitGroup counter"; clamp the parsed value up to this.
const minThreads = 1
// defaultCrawlDepth is the value -crawl-depth takes when the flag is not given.
// it bounds how far the spider recurses by default; deep enough
// to find linked pages without crawling an entire site.
const defaultCrawlDepth = 2
const (
Nil goflags.EnumVariable = iota
@@ -114,6 +121,9 @@ func Parse() *Settings {
flagSet.BoolVar(&settings.Redirect, "redirect", false, "Enable open redirect probe"),
flagSet.BoolVar(&settings.XSS, "xss", false, "Enable reflected XSS probe"),
flagSet.BoolVar(&settings.Framework, "framework", false, "Enable framework detection"),
flagSet.BoolVar(&settings.Crawl, "crawl", false, "Enable web crawling (spider same-host links/scripts/forms)"),
flagSet.IntVar(&settings.CrawlDepth, "crawl-depth", defaultCrawlDepth, "Max crawl recursion depth"),
flagSet.BoolVar(&settings.Passive, "passive", false, "Enable passive subdomain/url discovery (zero traffic to target)"),
)
flagSet.CreateGroup("runtime", "Runtime",
+137
View File
@@ -0,0 +1,137 @@
/*
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
: :
: █▀ █ █▀▀ · Blazing-fast pentesting suite :
: ▄█ █ █▀ · BSD 3-Clause License :
: :
: (c) 2022-2026 vmfunc, xyzeva, :
: lunchcat alumni & contributors :
: :
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
*/
package scan
import (
"fmt"
"net/url"
"sort"
"sync"
"time"
"github.com/gocolly/colly/v2"
"github.com/dropalldatabases/sif/internal/httpx"
"github.com/dropalldatabases/sif/internal/logger"
"github.com/dropalldatabases/sif/internal/output"
)
// CrawlResult holds the deduped set of urls discovered by the spider.
// Crawl fills URLs from sortedKeys, so the slice is sorted and never nil.
type CrawlResult struct {
// URLs lists every same-host url the crawl recorded (links, script
// sources and form actions), serialized under the "urls" key.
URLs []string `json:"urls"`
}
// ResultType returns the identifier this result is reported under.
func (r *CrawlResult) ResultType() string { return "crawl" }
// compile-time check so a result-type drift fails the build, not a run.
var _ ScanResult = (*CrawlResult)(nil)
// Crawl spiders the target up to depth, following same-host links and recording
// same-host script sources and form actions along the way. all traffic flows
// through the shared httpx client so proxy/headers/rate-limit apply, and
// robots.txt is respected.
//
// depth is handed straight to colly.MaxDepth: the start page counts as depth 1,
// so depth 1 records the links on the start page without following them, and a
// depth of 0 disables the limit entirely.
//
// the returned urls are deduped and sorted. an error is returned when the
// target url cannot be parsed or has no host, when the log file cannot be
// created, or when the very first fetch fails (including a robots.txt
// disallow on the start page); failures on later pages are only logged.
func Crawl(targetURL string, depth int, timeout time.Duration, logdir string) (*CrawlResult, error) {
	log := output.Module("CRAWL")
	log.Start()

	// the host bounds the crawl; without it colly would wander the whole web.
	// validate it before touching the log dir so a bad target leaves no file.
	parsed, err := url.Parse(targetURL)
	if err != nil {
		return nil, fmt.Errorf("parse target url %q: %w", targetURL, err)
	}
	host := parsed.Hostname()
	if host == "" {
		return nil, fmt.Errorf("target url %q has no host", targetURL)
	}

	sanitizedURL := stripScheme(targetURL)
	if logdir != "" {
		if err := logger.WriteHeader(sanitizedURL, logdir, "web crawl"); err != nil {
			log.Error("error creating log file: %v", err)
			return nil, fmt.Errorf("create crawl log: %w", err)
		}
	}

	collector := colly.NewCollector(
		colly.MaxDepth(depth),
		colly.AllowedDomains(host),
	)
	// colly.NewCollector starts with IgnoreRobotsTxt = true, i.e. robots.txt
	// is NOT consulted unless we opt in. the crawler is documented as
	// respecting robots.txt, so turn the check on explicitly.
	collector.IgnoreRobotsTxt = false
	// reuse the shared client so proxy/cookie/-H/rate-limit are honored and the
	// configured timeout applies to every fetch, robots.txt included.
	collector.SetClient(httpx.Client(timeout))

	// dedupe across the concurrent callbacks colly may fire.
	var mu sync.Mutex
	seen := make(map[string]struct{})
	record := func(raw string) {
		if raw == "" {
			return
		}
		// keep the result set scoped to the target host; off-host assets
		// (cdns, third-party links) are noise for an in-scope crawl.
		if u, err := url.Parse(raw); err != nil || u.Hostname() != host {
			return
		}
		mu.Lock()
		defer mu.Unlock()
		if _, dup := seen[raw]; dup {
			return
		}
		seen[raw] = struct{}{}
		log.Success("found: %s", output.Highlight.Render(raw))
		if logdir != "" {
			// best-effort: a failed log write must not drop the finding,
			// and warning per url would drown the output.
			_ = logger.Write(sanitizedURL, logdir, raw+"\n")
		}
	}

	// links drive recursion; scripts/forms are recorded but not followed.
	collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		record(link)
		// Visit enforces AllowedDomains/MaxDepth/robots.txt itself, so
		// off-host, too-deep or disallowed links are dropped without us
		// re-checking; the returned error is just that rejection.
		_ = e.Request.Visit(link)
	})
	collector.OnHTML("script[src]", func(e *colly.HTMLElement) {
		record(e.Request.AbsoluteURL(e.Attr("src")))
	})
	collector.OnHTML("form[action]", func(e *colly.HTMLElement) {
		record(e.Request.AbsoluteURL(e.Attr("action")))
	})
	collector.OnError(func(_ *colly.Response, e error) {
		// a single bad page shouldn't abort the crawl; note it and move on.
		log.Warn("crawl error: %v", e)
	})

	if err := collector.Visit(targetURL); err != nil {
		log.Error("crawl failed: %v", err)
		return nil, fmt.Errorf("visit %q: %w", targetURL, err)
	}
	collector.Wait()

	result := &CrawlResult{URLs: sortedKeys(seen)}
	log.Complete(len(result.URLs), "urls")
	return result, nil
}
// sortedKeys flattens a string set into a slice in ascending order, so callers
// get deterministic output despite go's randomized map iteration. the result
// is never nil, even for an empty or nil set.
func sortedKeys(set map[string]struct{}) []string {
	out := make([]string, len(set))
	idx := 0
	for key := range set {
		out[idx] = key
		idx++
	}
	sort.Sort(sort.StringSlice(out))
	return out
}
+158
View File
@@ -0,0 +1,158 @@
/*
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
: :
: █▀ █ █▀▀ · Blazing-fast pentesting suite :
: ▄█ █ █▀ · BSD 3-Clause License :
: :
: (c) 2022-2026 vmfunc, xyzeva, :
: lunchcat alumni & contributors :
: :
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
*/
package scan
import (
"net/http"
"net/http/httptest"
"testing"
"time"
)
// crawlSite starts a throwaway server exposing a four-page chain:
//
//	/  -> links /a and an off-host page; references script.js, form action /submit
//	/a -> links /b
//	/b -> links /c (only reachable at depth 3)
//	/c -> leaf
//
// the server is shut down automatically when the test finishes.
func crawlSite(t *testing.T) *httptest.Server {
	t.Helper()

	// fixed pages served on an exact path; each one links a hop deeper.
	pages := []struct{ path, body string }{
		{"/a", `<a href="/b">b</a>`},
		{"/b", `<a href="/c">c</a>`},
		{"/c", `leaf`},
	}

	mux := http.NewServeMux()
	for _, page := range pages {
		payload := []byte(page.body)
		mux.HandleFunc(page.path, func(w http.ResponseWriter, _ *http.Request) {
			_, _ = w.Write(payload)
		})
	}

	// answer 404 so a crawler that consults robots.txt finds no restrictions.
	mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusNotFound)
	})

	// "/" is the mux catch-all, so anything but the exact root is a 404.
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			_, _ = w.Write([]byte(`<html><body>
<a href="/a">a</a>
<a href="https://off-host.example/x">off</a>
<script src="/script.js"></script>
<form action="/submit"></form>
</body></html>`))
			return
		}
		http.NotFound(w, r)
	})

	server := httptest.NewServer(mux)
	t.Cleanup(server.Close)
	return server
}
// urlsContain reports whether want appears verbatim in urls.
func urlsContain(urls []string, want string) bool {
	for _, candidate := range urls {
		if candidate == want {
			return true
		}
	}
	return false
}
// TestCrawl_FindsLinkedPagesAndAssets checks that a depth-3 crawl of the
// fixture site records links, scripts and form actions as absolute urls and
// leaves the off-host link out.
func TestCrawl_FindsLinkedPagesAndAssets(t *testing.T) {
	srv := crawlSite(t)

	result, err := Crawl(srv.URL, 3, 5*time.Second, "")
	if err != nil {
		t.Fatalf("Crawl: %v", err)
	}
	found := result.URLs

	// every on-host path must show up resolved against the server url.
	for _, path := range []string{"/a", "/b", "/c", "/script.js", "/submit"} {
		want := srv.URL + path
		if urlsContain(found, want) {
			continue
		}
		t.Errorf("expected crawl to find %q, got %v", want, found)
	}

	// off-host links must never reach the result set.
	const offHost = "https://off-host.example/x"
	if urlsContain(found, offHost) {
		t.Errorf("off-host link should be excluded, got %v", found)
	}
}
// TestCrawl_RespectsDepth checks that a depth-1 crawl records what the root
// page references (/a, /script.js, /submit) but never fetches /a, so the
// deeper /b and /c stay undiscovered.
func TestCrawl_RespectsDepth(t *testing.T) {
	srv := crawlSite(t)

	result, err := Crawl(srv.URL, 1, 5*time.Second, "")
	if err != nil {
		t.Fatalf("Crawl: %v", err)
	}
	found := result.URLs
	has := func(path string) bool { return urlsContain(found, srv.URL+path) }

	if !has("/a") {
		t.Errorf("depth 1 should find /a, got %v", found)
	}
	if has("/b") {
		t.Errorf("depth 1 must not reach /b, got %v", found)
	}
	if has("/c") {
		t.Errorf("depth 1 must not reach /c, got %v", found)
	}
}
// TestCrawl_Dedupes checks that a page linking the same target twice yields a
// single result entry.
func TestCrawl_Dedupes(t *testing.T) {
	mux := http.NewServeMux()
	mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusNotFound)
	})
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// every path except /dup serves the page with the doubled link.
		page := `<a href="/dup">1</a><a href="/dup">2</a>`
		if r.URL.Path == "/dup" {
			page = `leaf`
		}
		_, _ = w.Write([]byte(page))
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()

	result, err := Crawl(srv.URL, 2, 5*time.Second, "")
	if err != nil {
		t.Fatalf("Crawl: %v", err)
	}

	target := srv.URL + "/dup"
	hits := 0
	for i := range result.URLs {
		if result.URLs[i] == target {
			hits++
		}
	}
	if hits != 1 {
		t.Errorf("expected /dup once after dedupe, got %d in %v", hits, result.URLs)
	}
}
// TestCrawl_ResultType pins the identifier crawl results are reported under.
func TestCrawl_ResultType(t *testing.T) {
	var result CrawlResult
	if got := result.ResultType(); got != "crawl" {
		t.Errorf("ResultType = %q, want crawl", got)
	}
}
+266
View File
@@ -0,0 +1,266 @@
/*
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
: :
: █▀ █ █▀▀ · Blazing-fast pentesting suite :
: ▄█ █ █▀ · BSD 3-Clause License :
: :
: (c) 2022-2026 vmfunc, xyzeva, :
: lunchcat alumni & contributors :
: :
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
*/
package scan
import (
"bufio"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/dropalldatabases/sif/internal/httpx"
"github.com/dropalldatabases/sif/internal/logger"
"github.com/dropalldatabases/sif/internal/output"
)
// source base urls are vars so tests can repoint them at local fixtures. each
// is a fmt format string with exactly one %s verb that receives the target
// domain. every feed is queried over https so the domain under investigation
// is not exposed to (or tampered with by) anyone on the network path.
var (
	// %%25 renders as the url-encoded "%" wildcard, i.e. q=%.<domain>.
	crtshBaseURL       = "https://crt.sh/?q=%%25.%s&output=json"
	certspotterBaseURL = "https://api.certspotter.com/v1/issuances?domain=%s&include_subdomains=true&expand=dns_names"
	waybackBaseURL     = "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=text&fl=original&collapse=urlkey"
)
// cap the response we read from any one source so a hostile/huge feed can't
// exhaust memory. the value is 25 MiB; passiveGET applies it through
// io.LimitReader, so a longer body is cut off at the cap rather than rejected.
const passiveMaxBytes = 25 * 1024 * 1024
// PassiveResult holds passively-gathered subdomains and historical urls. all
// data comes from third-party feeds; the target itself sees zero traffic.
// Passive fills both slices from sortedKeys, so each is deduped and sorted.
type PassiveResult struct {
// Subdomains are the host names collected from the crt.sh and
// certspotter feeds, serialized under the "subdomains" key.
Subdomains []string `json:"subdomains"`
// URLs are the historical urls returned by the wayback feed, serialized
// under the "urls" key.
URLs []string `json:"urls"`
}
// ResultType returns the identifier this result is reported under.
func (r *PassiveResult) ResultType() string { return "passive" }
// compile-time check so a result-type drift fails the build, not a run.
var _ ScanResult = (*PassiveResult)(nil)
// crtshEntry is one certificate record from crt.sh; name_value may itself hold
// several newline-separated names. only name_value is decoded; every other
// field in the record is ignored by the json decoder.
type crtshEntry struct {
// NameValue is the raw name_value field; fetchCrtsh splits it on "\n".
NameValue string `json:"name_value"`
}
// certspotterEntry is one issuance from certspotter, expanded to dns names.
// only dns_names is decoded; the rest of the issuance is ignored.
type certspotterEntry struct {
// DNSNames is the dns_names array of the issuance; the request asks for
// it with expand=dns_names.
DNSNames []string `json:"dns_names"`
}
// Passive performs keyless passive recon: subdomains from certificate
// transparency feeds plus historical urls from the wayback machine. each source
// fails independently so one feed being down doesn't sink the rest.
//
// certificate feeds report every name on a matching certificate, and shared or
// multi-domain certificates carry names of unrelated sites, so only the target
// host itself and names beneath it are kept as subdomains.
//
// the returned slices are deduped and sorted. an error is returned only when
// the target url cannot be parsed, has no host, or the log file cannot be
// created; feed failures are logged as warnings and yield partial results.
func Passive(targetURL string, timeout time.Duration, logdir string) (*PassiveResult, error) {
	log := output.Module("PASSIVE")
	log.Start()

	parsed, err := url.Parse(targetURL)
	if err != nil {
		return nil, fmt.Errorf("parse target url %q: %w", targetURL, err)
	}
	domain := parsed.Hostname()
	if domain == "" {
		return nil, fmt.Errorf("target url %q has no host", targetURL)
	}

	sanitizedURL := stripScheme(targetURL)
	if logdir != "" {
		if err := logger.WriteHeader(sanitizedURL, logdir, "passive recon"); err != nil {
			log.Error("error creating log file: %v", err)
			return nil, fmt.Errorf("create passive log: %w", err)
		}
	}

	client := httpx.Client(timeout)
	// Passive takes no context of its own, so this is the root; each request
	// is still bounded by the client's timeout.
	ctx := context.Background()

	subSet := make(map[string]struct{})
	urlSet := make(map[string]struct{})

	// crt.sh certificate transparency
	if subs, err := fetchCrtsh(ctx, client, domain); err != nil {
		log.Warn("crt.sh failed: %v", err)
	} else {
		addAll(subSet, passiveInScope(subs, domain))
	}

	// certspotter certificate transparency
	if subs, err := fetchCertspotter(ctx, client, domain); err != nil {
		log.Warn("certspotter failed: %v", err)
	} else {
		addAll(subSet, passiveInScope(subs, domain))
	}

	// wayback machine historical urls
	if urls, err := fetchWayback(ctx, client, domain); err != nil {
		log.Warn("wayback failed: %v", err)
	} else {
		addAll(urlSet, urls)
	}

	result := &PassiveResult{
		Subdomains: sortedKeys(subSet),
		URLs:       sortedKeys(urlSet),
	}

	logPassiveResults(log, sanitizedURL, logdir, result)
	log.Complete(len(result.Subdomains)+len(result.URLs), "discovered")
	return result, nil
}

// passiveInScope keeps the names that equal domain or sit beneath it. names
// are expected to be lowercase already (see normalizeHost); domain is
// lowercased here because url parsing preserves the case the user typed.
func passiveInScope(names []string, domain string) []string {
	root := strings.ToLower(strings.TrimSuffix(domain, "."))
	suffix := "." + root
	kept := make([]string, 0, len(names))
	for _, name := range names {
		if name == root || strings.HasSuffix(name, suffix) {
			kept = append(kept, name)
		}
	}
	return kept
}
// fetchCrtsh pulls subdomains from crt.sh's certificate transparency json.
func fetchCrtsh(ctx context.Context, client *http.Client, domain string) ([]string, error) {
body, err := passiveGET(ctx, client, fmt.Sprintf(crtshBaseURL, domain))
if err != nil {
return nil, err
}
var entries []crtshEntry
if err := json.Unmarshal(body, &entries); err != nil {
return nil, fmt.Errorf("parse crt.sh json: %w", err)
}
var names []string
for i := 0; i < len(entries); i++ {
// name_value can pack several names separated by newlines.
for _, name := range strings.Split(entries[i].NameValue, "\n") {
if host := normalizeHost(name); host != "" {
names = append(names, host)
}
}
}
return names, nil
}
// fetchCertspotter pulls subdomains from certspotter's keyless issuances feed.
func fetchCertspotter(ctx context.Context, client *http.Client, domain string) ([]string, error) {
body, err := passiveGET(ctx, client, fmt.Sprintf(certspotterBaseURL, domain))
if err != nil {
return nil, err
}
var entries []certspotterEntry
if err := json.Unmarshal(body, &entries); err != nil {
return nil, fmt.Errorf("parse certspotter json: %w", err)
}
var names []string
for i := 0; i < len(entries); i++ {
for _, name := range entries[i].DNSNames {
if host := normalizeHost(name); host != "" {
names = append(names, host)
}
}
}
return names, nil
}
// fetchWayback queries the wayback machine cdx index for domain. the response
// is plain text with one original url per line; blank lines are skipped and
// surrounding whitespace is trimmed.
func fetchWayback(ctx context.Context, client *http.Client, domain string) ([]string, error) {
	raw, err := passiveGET(ctx, client, fmt.Sprintf(waybackBaseURL, domain))
	if err != nil {
		return nil, err
	}

	// historical urls can be long, so start with a roomy buffer and allow
	// single lines of up to 1 MiB before the scanner gives up.
	const (
		initialLineBuf = 64 * 1024
		maxLineLen     = 1024 * 1024
	)
	lines := bufio.NewScanner(strings.NewReader(string(raw)))
	lines.Buffer(make([]byte, 0, initialLineBuf), maxLineLen)

	var found []string
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		if entry == "" {
			continue
		}
		found = append(found, entry)
	}
	if scanErr := lines.Err(); scanErr != nil {
		return nil, fmt.Errorf("read wayback lines: %w", scanErr)
	}
	return found, nil
}
// passiveGET issues a GET to reqURL and returns at most passiveMaxBytes of the
// body. any status other than 200 is reported as an error so the caller can
// skip that source.
func passiveGET(ctx context.Context, client *http.Client, reqURL string) ([]byte, error) {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, http.NoBody)
	if err != nil {
		return nil, fmt.Errorf("create request: %w", err)
	}
	request.Header.Set("Accept", "application/json")

	response, err := client.Do(request)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer response.Body.Close()

	if code := response.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %d", code)
	}

	// the limit reader stops at the cap; anything beyond it is never read.
	capped := io.LimitReader(response.Body, passiveMaxBytes)
	payload, err := io.ReadAll(capped)
	if err != nil {
		return nil, fmt.Errorf("read response: %w", err)
	}
	return payload, nil
}
// normalizeHost canonicalizes a certificate name: surrounding whitespace is
// trimmed, the name is lowercased, and one leading "*." wildcard label is
// dropped, so "*.example.com" and "EXAMPLE.com" both become "example.com".
func normalizeHost(name string) string {
	trimmed := strings.TrimSpace(name)
	return strings.TrimPrefix(strings.ToLower(trimmed), "*.")
}
// addAll marks each of values as present in set; repeats collapse into a
// single key. set must be non-nil when values is non-empty.
func addAll(set map[string]struct{}, values []string) {
	for i := range values {
		set[values[i]] = struct{}{}
	}
}
// logPassiveResults reports every finding through log and, when logdir is
// non-empty, writes a grouped summary (subdomains first, then historical urls)
// to the log file for sanitizedURL. a failed file write is reported as a
// warning; it never affects the result already shown on the console.
func logPassiveResults(log *output.ModuleLogger, sanitizedURL, logdir string, result *PassiveResult) {
	for _, sub := range result.Subdomains {
		log.Success("subdomain: %s", output.Highlight.Render(sub))
	}
	for _, u := range result.URLs {
		log.Info("url: %s", u)
	}

	if logdir == "" {
		return
	}

	var sb strings.Builder
	if len(result.Subdomains) > 0 {
		fmt.Fprintf(&sb, "Subdomains (%d):\n", len(result.Subdomains))
		for _, sub := range result.Subdomains {
			sb.WriteString(" " + sub + "\n")
		}
	}
	if len(result.URLs) > 0 {
		// the leading newline separates this section from the one above.
		fmt.Fprintf(&sb, "\nHistorical URLs (%d):\n", len(result.URLs))
		for _, u := range result.URLs {
			sb.WriteString(" " + u + "\n")
		}
	}

	// the findings are already on screen, so a failed write is not fatal, but
	// the user asked for a log file and should know it is incomplete.
	if err := logger.Write(sanitizedURL, logdir, sb.String()); err != nil {
		log.Warn("error writing passive log: %v", err)
	}
}
+163
View File
@@ -0,0 +1,163 @@
/*
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
: :
: █▀ █ █▀▀ · Blazing-fast pentesting suite :
: ▄█ █ █▀ · BSD 3-Clause License :
: :
: (c) 2022-2026 vmfunc, xyzeva, :
: lunchcat alumni & contributors :
: :
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
*/
package scan
import (
"net/http"
"net/http/httptest"
"testing"
"time"
)
// sample feed payloads. crt.sh packs several names per name_value (newline
// separated) and emits wildcards; certspotter returns expanded dns_names.
const (
// crtshFixture repeats www.example.com in a different case and includes a
// wildcard, so it exercises case-folding, wildcard stripping and dedupe.
crtshFixture = `[
{"name_value": "www.example.com\n*.example.com"},
{"name_value": "api.example.com"},
{"name_value": "WWW.example.com"}
]`
// certspotterFixture overlaps crtshFixture on api.example.com and the
// wildcard, and adds mail.example.com as the only name unique to it.
certspotterFixture = `[
{"dns_names": ["mail.example.com", "api.example.com"]},
{"dns_names": ["*.example.com"]}
]`
// waybackFixture lists /login twice and contains one blank line, leaving
// three distinct urls.
waybackFixture = "http://example.com/\n" +
"http://example.com/login\n" +
"http://example.com/login\n" +
"\n" +
"http://example.com/admin\n"
)
// fixtureServer starts a local server that answers /crtsh, /certspotter and
// /wayback with the given payloads, and points the package base-url vars at
// it. cleanup restores the vars and then shuts the server down.
func fixtureServer(t *testing.T, crtsh, certspotter, wayback string) *httptest.Server {
	t.Helper()

	routes := []struct{ path, payload string }{
		{"/crtsh", crtsh},
		{"/certspotter", certspotter},
		{"/wayback", wayback},
	}
	mux := http.NewServeMux()
	for _, route := range routes {
		payload := []byte(route.payload)
		mux.HandleFunc(route.path, func(w http.ResponseWriter, _ *http.Request) {
			_, _ = w.Write(payload)
		})
	}

	server := httptest.NewServer(mux)
	t.Cleanup(server.Close)

	// remember the production urls before overwriting them; cleanups run
	// last-in first-out, so the vars are restored before the server closes.
	savedCrtsh, savedCertspotter, savedWayback := crtshBaseURL, certspotterBaseURL, waybackBaseURL
	t.Cleanup(func() {
		crtshBaseURL, certspotterBaseURL, waybackBaseURL = savedCrtsh, savedCertspotter, savedWayback
	})

	// %s still consumes the domain so the production formatting path is exercised.
	crtshBaseURL = server.URL + "/crtsh?q=%s"
	certspotterBaseURL = server.URL + "/certspotter?domain=%s"
	waybackBaseURL = server.URL + "/wayback?url=%s"

	return server
}
// TestPassive_ParsesAndDedupes runs Passive against the fixture feeds and
// checks that names are normalized and merged across both certificate feeds,
// and that wayback urls are deduped with blank lines dropped.
func TestPassive_ParsesAndDedupes(t *testing.T) {
	fixtureServer(t, crtshFixture, certspotterFixture, waybackFixture)

	result, err := Passive("https://example.com", 5*time.Second, "")
	if err != nil {
		t.Fatalf("Passive: %v", err)
	}

	// expected hosts after wildcard stripping and case-folding; the flag
	// flips to true once the host is seen in the result.
	subSeen := map[string]bool{
		"www.example.com":  false,
		"api.example.com":  false,
		"mail.example.com": false,
		"example.com":      false, // from "*.example.com"
	}
	for i := range result.Subdomains {
		sub := result.Subdomains[i]
		if _, expected := subSeen[sub]; expected {
			subSeen[sub] = true
		} else {
			t.Errorf("unexpected subdomain %q", sub)
		}
	}
	for sub, hit := range subSeen {
		if hit {
			continue
		}
		t.Errorf("missing subdomain %q in %v", sub, result.Subdomains)
	}
	if got, want := len(result.Subdomains), len(subSeen); got != want {
		t.Errorf("expected %d deduped subdomains, got %d: %v", want, got, result.Subdomains)
	}

	// wayback: blank line dropped, duplicate /login collapsed.
	urlSeen := map[string]bool{
		"http://example.com/":      false,
		"http://example.com/login": false,
		"http://example.com/admin": false,
	}
	for i := range result.URLs {
		u := result.URLs[i]
		if _, expected := urlSeen[u]; expected {
			urlSeen[u] = true
		} else {
			t.Errorf("unexpected url %q", u)
		}
	}
	if got, want := len(result.URLs), len(urlSeen); got != want {
		t.Errorf("expected %d deduped urls, got %d: %v", want, got, result.URLs)
	}
}
// TestPassive_SourceFailureIsIsolated feeds crt.sh an unparseable body and
// checks that certspotter and wayback results still come through while nothing
// from the broken feed leaks into the output.
func TestPassive_SourceFailureIsIsolated(t *testing.T) {
	const brokenCrtsh = "not json"
	fixtureServer(t, brokenCrtsh, certspotterFixture, waybackFixture)

	result, err := Passive("https://example.com", 5*time.Second, "")
	if err != nil {
		t.Fatalf("Passive should not fail when one source is down: %v", err)
	}

	switch {
	case len(result.Subdomains) == 0:
		t.Error("expected certspotter subdomains despite crt.sh failure")
	case urlsContain(result.Subdomains, "www.example.com"):
		// www.example.com only appears in the crt.sh fixture.
		t.Error("crt.sh-only subdomain leaked despite parse failure")
	}
	if len(result.URLs) == 0 {
		t.Error("expected wayback urls despite crt.sh failure")
	}
}
// TestPassive_ResultType pins the identifier passive results are reported under.
func TestPassive_ResultType(t *testing.T) {
	var result PassiveResult
	if got := result.ResultType(); got != "passive" {
		t.Errorf("ResultType = %q, want passive", got)
	}
}
// TestNormalizeHost covers the pass-through, wildcard, mixed-case/padded and
// empty inputs of normalizeHost.
func TestNormalizeHost(t *testing.T) {
	type testCase struct {
		in   string
		want string
	}
	cases := []testCase{
		{in: "www.example.com", want: "www.example.com"},
		{in: "*.example.com", want: "example.com"},
		{in: "  WWW.Example.COM  ", want: "www.example.com"},
		{in: "", want: ""},
	}
	for i := range cases {
		tc := cases[i]
		got := normalizeHost(tc.in)
		if got == tc.want {
			continue
		}
		t.Errorf("normalizeHost(%q) = %q, want %q", tc.in, got, tc.want)
	}
}
+9
View File
@@ -98,6 +98,15 @@ reflected xss probe.
.B \-framework
framework detection with cve lookup.
.TP
.B \-crawl
web crawler; spiders same\-host links, scripts and forms, respecting robots.txt.
.TP
.BR \-crawl\-depth " \fIn\fR"
max crawl recursion depth (default 2).
.TP
.B \-passive
passive subdomain and historical url discovery from third\-party feeds (zero traffic to the target).
.TP
.B \-noscan
skip the base url scan (robots.txt, etc).
.SH OPTIONS
+20
View File
@@ -421,6 +421,26 @@ func (app *App) Run() error {
}
}
if app.settings.Crawl {
result, err := scan.Crawl(url, app.settings.CrawlDepth, app.settings.Timeout, app.settings.LogDir)
if err != nil {
log.Errorf("Error while running web crawl: %s", err)
} else if result != nil {
moduleResults = append(moduleResults, NewModuleResult(result))
scansRun = append(scansRun, "Crawl")
}
}
if app.settings.Passive {
result, err := scan.Passive(url, app.settings.Timeout, app.settings.LogDir)
if err != nil {
log.Errorf("Error while running passive discovery: %s", err)
} else if result != nil {
moduleResults = append(moduleResults, NewModuleResult(result))
scansRun = append(scansRun, "Passive")
}
}
// Load and run modules
if app.settings.AllModules || app.settings.Modules != "" || app.settings.ModuleTags != "" {
loader, err := modules.NewLoader()