mirror of
https://github.com/lunchcat/sif.git
synced 2026-06-12 19:11:25 -07:00
dbe79c495e
-crawl spiders same-host links/scripts/forms through the shared httpx client so proxy/headers/rate-limit and robots.txt are honored, bounded by -crawl-depth. -passive pulls subdomains from keyless ct feeds (crt.sh, certspotter) and historical urls from wayback, each source isolated so one feed being down doesn't sink the rest and the target sees no traffic.
159 lines
4.8 KiB
Go
159 lines
4.8 KiB
Go
/*
|
|
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
|
|
: :
|
|
: █▀ █ █▀▀ · Blazing-fast pentesting suite :
|
|
: ▄█ █ █▀ · BSD 3-Clause License :
|
|
: :
|
|
: (c) 2022-2026 vmfunc, xyzeva, :
|
|
: lunchcat alumni & contributors :
|
|
: :
|
|
·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━·
|
|
*/
|
|
|
|
package scan
|
|
|
|
import (
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
// crawlSite serves a small link graph:
|
|
//
|
|
// / -> links /a and an off-host page; references script.js, form action /submit
|
|
// /a -> links /b
|
|
// /b -> links /c (only reachable at depth 3)
|
|
// /c -> leaf
|
|
func crawlSite(t *testing.T) *httptest.Server {
|
|
t.Helper()
|
|
|
|
mux := http.NewServeMux()
|
|
// no robots restrictions; colly fetches this before crawling.
|
|
mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusNotFound)
|
|
})
|
|
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
|
|
if r.URL.Path != "/" {
|
|
http.NotFound(w, r)
|
|
return
|
|
}
|
|
_, _ = w.Write([]byte(`<html><body>
|
|
<a href="/a">a</a>
|
|
<a href="https://off-host.example/x">off</a>
|
|
<script src="/script.js"></script>
|
|
<form action="/submit"></form>
|
|
</body></html>`))
|
|
})
|
|
mux.HandleFunc("/a", func(w http.ResponseWriter, _ *http.Request) {
|
|
_, _ = w.Write([]byte(`<a href="/b">b</a>`))
|
|
})
|
|
mux.HandleFunc("/b", func(w http.ResponseWriter, _ *http.Request) {
|
|
_, _ = w.Write([]byte(`<a href="/c">c</a>`))
|
|
})
|
|
mux.HandleFunc("/c", func(w http.ResponseWriter, _ *http.Request) {
|
|
_, _ = w.Write([]byte(`leaf`))
|
|
})
|
|
|
|
srv := httptest.NewServer(mux)
|
|
t.Cleanup(srv.Close)
|
|
return srv
|
|
}
|
|
|
|
func urlsContain(urls []string, want string) bool {
|
|
for i := 0; i < len(urls); i++ {
|
|
if urls[i] == want {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func TestCrawl_FindsLinkedPagesAndAssets(t *testing.T) {
|
|
srv := crawlSite(t)
|
|
|
|
result, err := Crawl(srv.URL, 3, 5*time.Second, "")
|
|
if err != nil {
|
|
t.Fatalf("Crawl: %v", err)
|
|
}
|
|
|
|
// links, scripts and forms must all be recorded, resolved to absolute urls.
|
|
wants := []string{
|
|
srv.URL + "/a",
|
|
srv.URL + "/b",
|
|
srv.URL + "/c",
|
|
srv.URL + "/script.js",
|
|
srv.URL + "/submit",
|
|
}
|
|
for _, w := range wants {
|
|
if !urlsContain(result.URLs, w) {
|
|
t.Errorf("expected crawl to find %q, got %v", w, result.URLs)
|
|
}
|
|
}
|
|
|
|
// AllowedDomains must keep the off-host link out of the result set.
|
|
if urlsContain(result.URLs, "https://off-host.example/x") {
|
|
t.Errorf("off-host link should be excluded, got %v", result.URLs)
|
|
}
|
|
}
|
|
|
|
func TestCrawl_RespectsDepth(t *testing.T) {
|
|
srv := crawlSite(t)
|
|
|
|
// depth 1: only links found on the root page (/a, /script.js, /submit) are
|
|
// recorded; /b lives one hop deeper and must not appear.
|
|
result, err := Crawl(srv.URL, 1, 5*time.Second, "")
|
|
if err != nil {
|
|
t.Fatalf("Crawl: %v", err)
|
|
}
|
|
|
|
if !urlsContain(result.URLs, srv.URL+"/a") {
|
|
t.Errorf("depth 1 should find /a, got %v", result.URLs)
|
|
}
|
|
if urlsContain(result.URLs, srv.URL+"/b") {
|
|
t.Errorf("depth 1 must not reach /b, got %v", result.URLs)
|
|
}
|
|
if urlsContain(result.URLs, srv.URL+"/c") {
|
|
t.Errorf("depth 1 must not reach /c, got %v", result.URLs)
|
|
}
|
|
}
|
|
|
|
func TestCrawl_Dedupes(t *testing.T) {
|
|
// a page that links the same target twice must yield a single entry.
|
|
mux := http.NewServeMux()
|
|
mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusNotFound)
|
|
})
|
|
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
|
|
if r.URL.Path == "/dup" {
|
|
_, _ = w.Write([]byte(`leaf`))
|
|
return
|
|
}
|
|
_, _ = w.Write([]byte(`<a href="/dup">1</a><a href="/dup">2</a>`))
|
|
})
|
|
srv := httptest.NewServer(mux)
|
|
defer srv.Close()
|
|
|
|
result, err := Crawl(srv.URL, 2, 5*time.Second, "")
|
|
if err != nil {
|
|
t.Fatalf("Crawl: %v", err)
|
|
}
|
|
|
|
count := 0
|
|
for _, u := range result.URLs {
|
|
if u == srv.URL+"/dup" {
|
|
count++
|
|
}
|
|
}
|
|
if count != 1 {
|
|
t.Errorf("expected /dup once after dedupe, got %d in %v", count, result.URLs)
|
|
}
|
|
}
|
|
|
|
func TestCrawl_ResultType(t *testing.T) {
|
|
r := &CrawlResult{}
|
|
if r.ResultType() != "crawl" {
|
|
t.Errorf("ResultType = %q, want crawl", r.ResultType())
|
|
}
|
|
}
|