From 5050900f297603ac782c36f275a26ed65b3e4f79 Mon Sep 17 00:00:00 2001 From: vmfunc Date: Wed, 10 Jun 2026 14:45:32 -0700 Subject: [PATCH] feat(dirlist): response filters, wildcard calibration and custom wordlists the old scanner surfaced every response that wasn't 404/403, so modern SPA catch-all 200s flooded the output and made -dirlist near-useless. add ffuf-style matching: - -mc/-fc/-fr and -fs/-fw filter by status, regex, body size and word count; bodies are read through a capped io.LimitReader so size/word counts are deterministic and memory stays flat. filters win over matches. - -ac auto-calibrates the soft-404 baseline from a few deterministic non-existent paths and drops responses matching that wildcard shape. - -w overrides the size switch with a local file or remote list (fetched through the shared client so proxy/rate-limit apply); -e appends extensions per word. size and words are added to DirectoryResult for the json output. --- README.md | 8 + docs/usage.md | 36 +++ internal/config/config.go | 16 ++ internal/scan/dirlist.go | 418 +++++++++++++++++++++++++++--- internal/scan/dirlist_test.go | 360 +++++++++++++++++++++++++ internal/scan/integration_test.go | 2 +- man/sif.1 | 24 ++ sif.go | 13 +- 8 files changed, 833 insertions(+), 44 deletions(-) create mode 100644 internal/scan/dirlist_test.go diff --git a/README.md b/README.md index 59359f3..5c2a158 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,14 @@ sif has a modular architecture. modules are defined in yaml and can be extended | flag | description | |------|-------------| | `-dirlist` | directory and file fuzzing (small/medium/large) | +| `-mc` | dirlist: match these status codes (comma list, e.g. 
200,301) | +| `-fc` | dirlist: filter out these status codes (comma list) | +| `-fs` | dirlist: filter out responses of these body sizes (comma list) | +| `-fw` | dirlist: filter out responses with these word counts (comma list) | +| `-fr` | dirlist: filter out responses whose body matches this regex | +| `-ac` | dirlist: auto-calibrate the soft-404 wildcard baseline | +| `-w` | dirlist: custom wordlist (local file or url; overrides `-dirlist` size) | +| `-e` | dirlist: extensions appended to each word (comma list, e.g. php,bak,env) | | `-dnslist` | subdomain enumeration (small/medium/large) | | `-ports` | port scanning (common/full) | | `-nuclei` | vulnerability scanning with nuclei templates | diff --git a/docs/usage.md b/docs/usage.md index ae75048..55410ef 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -33,6 +33,42 @@ sizes: `small`, `medium`, `large` ./sif -u https://example.com -dirlist medium ``` +#### response filters + +modern apps serve a catch-all 200 for unknown routes, so a naive scan reports +every path. these ffuf-style filters cut the noise (a filter always wins over a +match): + +- `-mc <codes>` - match only these status codes (comma list, e.g. 
`200,301`) +- `-fc <codes>` - filter out these status codes +- `-fs <sizes>` - filter out responses of these body sizes +- `-fw <counts>` - filter out responses with these word counts +- `-fr <regex>` - filter out responses whose body matches this regex + +```bash +./sif -u https://example.com -dirlist medium -mc 200,301 -fs 1234 +``` + +#### wildcard calibration + +`-ac` probes a few paths that cannot exist, learns the soft-404 baseline +(status + size + words), and auto-drops any response matching it - so SPA +catch-all 200s stop flooding the output: + +```bash +./sif -u https://example.com -dirlist medium -ac +``` + +#### custom wordlists and extensions + +`-w <path|url>` overrides the size switch with your own list (local file or +remote url); `-e <exts>` appends each extension to every word, keeping the bare +word too: + +```bash +./sif -u https://example.com -w /path/to/words.txt -e php,bak,env +``` + ### subdomain enumeration `-dnslist <size>` - enumerate subdomains diff --git a/internal/config/config.go b/internal/config/config.go index bdd2f95..95f683c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -21,6 +21,14 @@ import ( type Settings struct { Dirlist string + DirMatchCodes string // -mc dirlist: status codes to keep + DirFilterCodes string // -fc dirlist: status codes to drop + DirFilterSizes string // -fs dirlist: body sizes to drop + DirFilterWords string // -fw dirlist: word counts to drop + DirFilterRegex string // -fr dirlist: regex; body match drops response + DirCalibrate bool // -ac dirlist: auto-calibrate soft-404 baseline + DirWordlist string // -w dirlist: custom wordlist (file path or url) + DirExtensions string // -e dirlist: extensions appended to each word Dnslist string Debug bool LogDir string @@ -100,6 +108,14 @@ func Parse() *Settings { portScopes := goflags.AllowdTypes{"common": Common, "full": Full, "none": Nil} flagSet.CreateGroup("scans", "Scans", flagSet.EnumVar(&settings.Dirlist, "dirlist", Nil, "Directory fuzzing scan 
size (small/medium/large)", 
listSizes), + flagSet.StringVar(&settings.DirMatchCodes, "mc", "", "Dirlist: match these status codes (comma list, e.g. 200,301)"), + flagSet.StringVar(&settings.DirFilterCodes, "fc", "", "Dirlist: filter out these status codes (comma list)"), + flagSet.StringVar(&settings.DirFilterSizes, "fs", "", "Dirlist: filter out responses of these body sizes (comma list)"), + flagSet.StringVar(&settings.DirFilterWords, "fw", "", "Dirlist: filter out responses with these word counts (comma list)"), + flagSet.StringVar(&settings.DirFilterRegex, "fr", "", "Dirlist: filter out responses whose body matches this regex"), + flagSet.BoolVar(&settings.DirCalibrate, "ac", false, "Dirlist: auto-calibrate the soft-404 wildcard baseline"), + flagSet.StringVar(&settings.DirWordlist, "w", "", "Dirlist: custom wordlist (local file path or url; overrides -dirlist size)"), + flagSet.StringVar(&settings.DirExtensions, "e", "", "Dirlist: extensions appended to each word (comma list, e.g. php,bak,env)"), flagSet.EnumVar(&settings.Dnslist, "dnslist", Nil, "DNS fuzzing scan size (small/medium/large)", listSizes), flagSet.EnumVar(&settings.Ports, "ports", Nil, "Port scanning scope (common/full)", portScopes), flagSet.BoolVar(&settings.Dorking, "dork", false, "Enable Google dorking"), diff --git a/internal/scan/dirlist.go b/internal/scan/dirlist.go index f56824b..fcbc5f4 100644 --- a/internal/scan/dirlist.go +++ b/internal/scan/dirlist.go @@ -16,8 +16,12 @@ import ( "bufio" "context" "fmt" + "io" "net/http" + "os" + "regexp" "strconv" + "strings" "sync" "time" @@ -36,13 +40,342 @@ const ( bigFile = "directory-list-2.3-big.txt" ) +// dirlistBodyCap bounds how many bytes we read per response before computing +// size/word counts. modern apps stream large html; capping keeps memory flat +// and makes size/word matching deterministic against arbitrarily large bodies. +const dirlistBodyCap = 512 * 1024 + +// soft-404 calibration probes. 
we ask for a handful of deterministic paths that +// cannot exist, then treat any response shape they share as the wildcard +// baseline. deterministic (no rng) so the workflow stays reproducible. +const ( + calibrationProbes = 3 + calibrationPrefix = "/sif-cal-" +) + +// statusNotFound / statusForbidden are the historical default "not interesting" +// codes; they seed the filter set when no explicit -mc/-fc is given. +const ( + statusNotFound = 404 + statusForbidden = 403 +) + type DirectoryResult struct { Url string `json:"url"` StatusCode int `json:"status_code"` + Size int `json:"size"` + Words int `json:"words"` } -// Dirlist performs directory fuzzing on the target URL. -func Dirlist(size string, url string, timeout time.Duration, threads int, logdir string) ([]DirectoryResult, error) { +// DirlistOptions carries the ffuf-style matcher knobs. the zero value reproduces +// the legacy behavior (report everything that isn't 404/403), so callers that +// don't set anything keep the old defaults. +type DirlistOptions struct { + MatchCodes string // -mc comma list of status codes to keep + FilterCodes string // -fc comma list of status codes to drop + FilterSizes string // -fs comma list of body sizes to drop + FilterWords string // -fw comma list of word counts to drop + FilterRegex string // -fr regex; a body match drops the response + Calibrate bool // -ac auto-calibrate the soft-404 wildcard baseline + Wordlist string // -w local path or url; overrides the size switch + Extensions string // -e comma list appended to each word (php,bak,env) +} + +// responseMeta is the shape we match on: just enough of the response to decide +// keep/drop without holding the whole body. 
+type responseMeta struct { + status int + size int + words int +} + +// matcher decides whether a response is "interesting" using the same precedence +// as ffuf/feroxbuster: an explicit filter (-fc/-fs/-fw/-fr or a calibrated +// baseline) drops the response, otherwise the match-code set decides. +type matcher struct { + matchCodes map[int]struct{} + filterCodes map[int]struct{} + filterSizes map[int]struct{} + filterWords map[int]struct{} + filterRe *regexp.Regexp + baselines []responseMeta // calibrated soft-404 shapes to suppress +} + +// newMatcher builds the matcher from raw flag strings. when -mc is empty the +// match set is left empty, which Matches reads as "keep anything not explicitly +// filtered" - i.e. the legacy behavior minus the hardcoded 404/403, which move +// into the filter set instead. +func newMatcher(opts *DirlistOptions) (*matcher, error) { + m := &matcher{ + filterSizes: make(map[int]struct{}), + filterWords: make(map[int]struct{}), + } + + codes, err := parseIntSet(opts.MatchCodes) + if err != nil { + return nil, fmt.Errorf("parse -mc: %w", err) + } + m.matchCodes = codes + + m.filterCodes, err = parseIntSet(opts.FilterCodes) + if err != nil { + return nil, fmt.Errorf("parse -fc: %w", err) + } + // no explicit -mc or -fc means we fall back to the historical "drop 404/403" + // behavior; encode it as filters so the rest of the logic is uniform. 
+ if len(m.matchCodes) == 0 && len(m.filterCodes) == 0 { + m.filterCodes[statusNotFound] = struct{}{} + m.filterCodes[statusForbidden] = struct{}{} + } + + m.filterSizes, err = parseIntSet(opts.FilterSizes) + if err != nil { + return nil, fmt.Errorf("parse -fs: %w", err) + } + + m.filterWords, err = parseIntSet(opts.FilterWords) + if err != nil { + return nil, fmt.Errorf("parse -fw: %w", err) + } + + if opts.FilterRegex != "" { + re, err := regexp.Compile(opts.FilterRegex) + if err != nil { + return nil, fmt.Errorf("parse -fr: %w", err) + } + m.filterRe = re + } + + return m, nil +} + +// Matches reports whether the response should surface as a finding. filters win +// over matches: a calibrated baseline, an -fc/-fs/-fw hit, or an -fr body match +// always drops the response; otherwise the -mc set (when set) gates it. +func (m *matcher) Matches(meta responseMeta, body []byte) bool { + // a calibrated soft-404 shape is the same response the catch-all hands every + // bogus path, so drop anything that matches a baseline exactly. + for i := 0; i < len(m.baselines); i++ { + b := m.baselines[i] + if b.status == meta.status && b.size == meta.size && b.words == meta.words { + return false + } + } + + if _, drop := m.filterCodes[meta.status]; drop { + return false + } + if _, drop := m.filterSizes[meta.size]; drop { + return false + } + if _, drop := m.filterWords[meta.words]; drop { + return false + } + if m.filterRe != nil && m.filterRe.Match(body) { + return false + } + + // an explicit -mc set is allow-list semantics; without it we keep whatever + // survived the filters above. + if len(m.matchCodes) > 0 { + _, keep := m.matchCodes[meta.status] + return keep + } + + return true +} + +// parseIntSet turns a comma list like "200,301,500" into a set. empty input is an +// empty (non-nil) set, not an error, so unset flags are a no-op. 
+func parseIntSet(raw string) (map[int]struct{}, error) { + set := make(map[int]struct{}) + if raw == "" { + return set, nil + } + for _, part := range strings.Split(raw, ",") { + part = strings.TrimSpace(part) + if part == "" { + continue + } + n, err := strconv.Atoi(part) + if err != nil { + return nil, fmt.Errorf("invalid integer %q: %w", part, err) + } + set[n] = struct{}{} + } + return set, nil +} + +// readMeta drains the response (capped) and returns its match shape plus the +// body bytes the regex filter needs. it never returns the raw resp; callers +// close the body after this returns. +func readMeta(resp *http.Response) (responseMeta, []byte) { + body, err := io.ReadAll(io.LimitReader(resp.Body, dirlistBodyCap)) + if err != nil { + // a truncated/aborted body still has a usable status; treat what we read + // as the body rather than dropping the whole response. + charmlog.Debugf("dirlist: read body: %v", err) + } + return responseMeta{ + status: resp.StatusCode, + size: len(body), + words: countWords(body), + }, body +} + +// countWords counts whitespace-separated tokens; the cheap proxy ffuf uses to +// tell a soft-404 stub apart from a real page of the same byte size. +func countWords(body []byte) int { + return len(strings.Fields(string(body))) +} + +// expandWords appends each extension to every base word, keeping the bare word +// too. an empty extensions list returns the words unchanged. +func expandWords(words []string, extensions string) []string { + exts := splitExtensions(extensions) + if len(exts) == 0 { + return words + } + // each word yields itself plus one entry per extension. 
+ expanded := make([]string, 0, len(words)*(len(exts)+1)) + for i := 0; i < len(words); i++ { + expanded = append(expanded, words[i]) + for j := 0; j < len(exts); j++ { + expanded = append(expanded, words[i]+"."+exts[j]) + } + } + return expanded +} + +// splitExtensions normalizes "php, .bak ,env" into ["php","bak","env"]; a +// leading dot is tolerated so both "php" and ".php" work. +func splitExtensions(raw string) []string { + if raw == "" { + return nil + } + parts := strings.Split(raw, ",") + exts := make([]string, 0, len(parts)) + for i := 0; i < len(parts); i++ { + ext := strings.TrimSpace(parts[i]) + ext = strings.TrimPrefix(ext, ".") + if ext != "" { + exts = append(exts, ext) + } + } + return exts +} + +// loadWordlist reads the fuzzing words. a custom -w overrides the size switch: +// an http(s) value is fetched through the shared client, anything else is a +// local file. with no -w it downloads the size-selected sif-runtime list. +func loadWordlist(opts *DirlistOptions, size string, client *http.Client) ([]string, error) { + if opts.Wordlist != "" { + if strings.HasPrefix(opts.Wordlist, "http://") || strings.HasPrefix(opts.Wordlist, "https://") { + return fetchWordlist(opts.Wordlist, client) + } + return readWordlistFile(opts.Wordlist) + } + + var file string + switch size { + case "small": + file = smallFile + case "medium": + file = mediumFile + case "large": + file = bigFile + default: + return nil, fmt.Errorf("unknown dirlist size %q", size) + } + return fetchWordlist(directoryURL+file, client) +} + +// fetchWordlist downloads a remote wordlist through the shared client so proxy +// and rate-limit settings apply to the fetch too. 
+func fetchWordlist(listURL string, client *http.Client) ([]string, error) { + req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, listURL, http.NoBody) + if err != nil { + return nil, fmt.Errorf("build wordlist request: %w", err) + } + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("download wordlist %q: %w", listURL, err) + } + defer resp.Body.Close() + return scanLines(resp.Body), nil +} + +// readWordlistFile loads a local wordlist file. +func readWordlistFile(path string) ([]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("open wordlist %q: %w", path, err) + } + defer f.Close() + return scanLines(f), nil +} + +// scanLines reads non-empty lines into a slice. +func scanLines(r io.Reader) []string { + var lines []string + scanner := bufio.NewScanner(r) + scanner.Split(bufio.ScanLines) + for scanner.Scan() { + line := scanner.Text() + if line != "" { + lines = append(lines, line) + } + } + return lines +} + +// calibrate probes a few paths that cannot exist and records the response shapes +// the catch-all hands them. those baselines feed the matcher so a soft-404 200 +// (the SPA wildcard) is suppressed before the real run. deterministic by design: +// the probe paths come from the loop index, never a random source. 
+func calibrate(m *matcher, baseURL string, client *http.Client) { + for i := 0; i < calibrationProbes; i++ { + probe := baseURL + calibrationPrefix + strconv.Itoa(i) + req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, probe, http.NoBody) + if err != nil { + charmlog.Debugf("dirlist: build calibration request: %v", err) + continue + } + resp, err := client.Do(req) + if err != nil { + charmlog.Debugf("dirlist: calibration probe %s: %v", probe, err) + continue + } + meta, _ := readMeta(resp) + resp.Body.Close() + + // a genuine hard 404 already gets filtered by code; only soft responses + // (a 200/30x catch-all) need a size/word baseline to suppress them. + if meta.status == statusNotFound { + continue + } + if !containsBaseline(m.baselines, meta) { + m.baselines = append(m.baselines, meta) + } + } +} + +// containsBaseline reports whether the shape is already recorded, so repeated +// probes returning the same soft-404 don't bloat the baseline set. +func containsBaseline(baselines []responseMeta, meta responseMeta) bool { + for i := 0; i < len(baselines); i++ { + if baselines[i] == meta { + return true + } + } + return false +} + +// Dirlist performs directory fuzzing on the target URL with ffuf-style response +// filtering, soft-404 calibration and custom wordlists. +// +//nolint:gocritic // opts is the scanner's stable public config; passed by value to match the other scanners' entry points. 
+func Dirlist(size string, url string, timeout time.Duration, threads int, logdir string, opts DirlistOptions) (DirectoryResults, error) { log := output.Module("DIRLIST") log.Start() @@ -55,35 +388,27 @@ func Dirlist(size string, url string, timeout time.Duration, threads int, logdir } } - var list string - switch size { - case "small": - list = directoryURL + smallFile - case "medium": - list = directoryURL + mediumFile - case "large": - list = directoryURL + bigFile + matcher, err := newMatcher(&opts) + if err != nil { + log.Error("invalid matcher flags: %v", err) + return nil, err } client := httpx.Client(timeout) - req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, list, http.NoBody) + directories, err := loadWordlist(&opts, size, client) if err != nil { - log.Error("Error creating directory list request: %s", err) + log.Error("Error loading directory list: %s", err) return nil, err } - resp, err := client.Do(req) - if err != nil { - log.Error("Error downloading directory list: %s", err) - return nil, err - } - defer resp.Body.Close() + directories = expandWords(directories, opts.Extensions) - var directories []string - scanner := bufio.NewScanner(resp.Body) - scanner.Split(bufio.ScanLines) - for scanner.Scan() { - directories = append(directories, scanner.Text()) + // -ac learns the wildcard baseline before the run so catch-all 200s drop. 
+ if opts.Calibrate { + calibrate(matcher, url, client) + if len(matcher.baselines) > 0 { + log.Info("calibrated %d soft-404 baseline(s)", len(matcher.baselines)) + } } progress := output.NewProgress(len(directories), "fuzzing") @@ -92,7 +417,7 @@ func Dirlist(size string, url string, timeout time.Duration, threads int, logdir var mu sync.Mutex wg.Add(threads) - results := make([]DirectoryResult, 0, 64) + results := make(DirectoryResults, 0, 64) for thread := 0; thread < threads; thread++ { go func(thread int) { defer wg.Done() @@ -116,24 +441,35 @@ func Dirlist(size string, url string, timeout time.Duration, threads int, logdir continue } - if resp.StatusCode != 404 && resp.StatusCode != 403 { - progress.Pause() - log.Success("found: %s [%s]", output.Highlight.Render(directory), output.Status.Render(strconv.Itoa(resp.StatusCode))) - progress.Resume() - - if logdir != "" { - _ = logger.Write(sanitizedURL, logdir, fmt.Sprintf("%s [%s]\n", strconv.Itoa(resp.StatusCode), directory)) - } - - result := DirectoryResult{ - Url: resp.Request.URL.String(), - StatusCode: resp.StatusCode, - } - mu.Lock() - results = append(results, result) - mu.Unlock() - } + meta, body := readMeta(resp) + reqURL := resp.Request.URL.String() resp.Body.Close() + + if !matcher.Matches(meta, body) { + continue + } + + progress.Pause() + log.Success("found: %s [%s] (size=%d words=%d)", + output.Highlight.Render(directory), + output.Status.Render(strconv.Itoa(meta.status)), + meta.size, meta.words) + progress.Resume() + + if logdir != "" { + _ = logger.Write(sanitizedURL, logdir, + fmt.Sprintf("%s [%s] size=%d words=%d\n", strconv.Itoa(meta.status), directory, meta.size, meta.words)) + } + + result := DirectoryResult{ + Url: reqURL, + StatusCode: meta.status, + Size: meta.size, + Words: meta.words, + } + mu.Lock() + results = append(results, result) + mu.Unlock() } }(thread) } diff --git a/internal/scan/dirlist_test.go b/internal/scan/dirlist_test.go new file mode 100644 index 0000000..f0acff4 --- 
/dev/null +++ b/internal/scan/dirlist_test.go @@ -0,0 +1,360 @@ +/* +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +: : +: █▀ █ █▀▀ · Blazing-fast pentesting suite : +: ▄█ █ █▀ · BSD 3-Clause License : +: : +: (c) 2022-2026 vmfunc, xyzeva, : +: lunchcat alumni & contributors : +: : +·━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━· +*/ + +package scan + +import ( + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "reflect" + "strings" + "testing" + "time" +) + +func TestMatcher_Matches(t *testing.T) { + tests := []struct { + name string + opts DirlistOptions + meta responseMeta + body string + want bool + }{ + { + // default behavior: 404/403 drop, everything else surfaces + name: "default keeps 200", + opts: DirlistOptions{}, + meta: responseMeta{status: 200, size: 10, words: 2}, + want: true, + }, + { + name: "default drops 404", + opts: DirlistOptions{}, + meta: responseMeta{status: 404, size: 9, words: 1}, + want: false, + }, + { + name: "default drops 403", + opts: DirlistOptions{}, + meta: responseMeta{status: 403, size: 9, words: 1}, + want: false, + }, + { + // -mc is allow-list: only listed codes survive + name: "mc allowlist keeps listed", + opts: DirlistOptions{MatchCodes: "200,301"}, + meta: responseMeta{status: 301, size: 0, words: 0}, + want: true, + }, + { + name: "mc allowlist drops unlisted 200 already excluded", + opts: DirlistOptions{MatchCodes: "301"}, + meta: responseMeta{status: 200, size: 5, words: 1}, + want: false, + }, + { + name: "fc drops listed code", + opts: DirlistOptions{FilterCodes: "500"}, + meta: responseMeta{status: 500, size: 5, words: 1}, + want: false, + }, + { + // with an explicit -fc and no -mc, the implicit 404/403 filter is not + // added, so a 200 still surfaces + name: "fc leaves others", + opts: DirlistOptions{FilterCodes: "500"}, + meta: responseMeta{status: 200, size: 5, words: 1}, + want: true, + }, + { + name: "fs drops listed size", + 
opts: DirlistOptions{FilterSizes: "1024"}, + meta: responseMeta{status: 200, size: 1024, words: 50}, + want: false, + }, + { + name: "fw drops listed word count", + opts: DirlistOptions{FilterWords: "7"}, + meta: responseMeta{status: 200, size: 40, words: 7}, + want: false, + }, + { + name: "fr drops body match", + opts: DirlistOptions{FilterRegex: "not found"}, + meta: responseMeta{status: 200, size: 9, words: 2}, + body: "page not found", + want: false, + }, + { + name: "fr keeps non-match", + opts: DirlistOptions{FilterRegex: "not found"}, + meta: responseMeta{status: 200, size: 5, words: 1}, + body: "welcome", + want: true, + }, + { + // filter precedence: -mc would keep it, but a size filter drops it + name: "filter wins over match", + opts: DirlistOptions{MatchCodes: "200", FilterSizes: "12"}, + meta: responseMeta{status: 200, size: 12, words: 3}, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m, err := newMatcher(&tt.opts) + if err != nil { + t.Fatalf("newMatcher: %v", err) + } + if got := m.Matches(tt.meta, []byte(tt.body)); got != tt.want { + t.Errorf("Matches(%+v, %q) = %v, want %v", tt.meta, tt.body, got, tt.want) + } + }) + } +} + +func TestMatcher_BaselineSuppresses(t *testing.T) { + m, err := newMatcher(&DirlistOptions{}) + if err != nil { + t.Fatalf("newMatcher: %v", err) + } + // a calibrated soft-404 shape drops an identical response + m.baselines = []responseMeta{{status: 200, size: 42, words: 5}} + + soft := responseMeta{status: 200, size: 42, words: 5} + if m.Matches(soft, nil) { + t.Error("baseline-matching response should be suppressed") + } + // a real page with a different size must still surface + livePage := responseMeta{status: 200, size: 99, words: 12} + if !m.Matches(livePage, nil) { + t.Error("distinct response should not be suppressed by baseline") + } +} + +func TestNewMatcher_InvalidFlags(t *testing.T) { + tests := []struct { + name string + opts DirlistOptions + }{ + {"bad mc", 
DirlistOptions{MatchCodes: "abc"}}, + {"bad fc", DirlistOptions{FilterCodes: "20x"}}, + {"bad fs", DirlistOptions{FilterSizes: "big"}}, + {"bad fw", DirlistOptions{FilterWords: "-"}}, + {"bad regex", DirlistOptions{FilterRegex: "("}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if _, err := newMatcher(&tt.opts); err == nil { + t.Errorf("newMatcher(%+v) expected error, got nil", tt.opts) + } + }) + } +} + +func TestExpandWords(t *testing.T) { + tests := []struct { + name string + words []string + exts string + want []string + }{ + { + name: "no extensions unchanged", + words: []string{"admin", "login"}, + exts: "", + want: []string{"admin", "login"}, + }, + { + name: "appends each extension and keeps bare", + words: []string{"config"}, + exts: "php,bak,env", + want: []string{"config", "config.php", "config.bak", "config.env"}, + }, + { + name: "tolerates leading dot and spaces", + words: []string{"db"}, + exts: " .sql , bak ", + want: []string{"db", "db.sql", "db.bak"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := expandWords(tt.words, tt.exts) + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("expandWords(%v, %q) = %v, want %v", tt.words, tt.exts, got, tt.want) + } + }) + } +} + +// softWildcardApp serves a couple of real paths and a catch-all soft-404: every +// unknown path returns a fixed 200 body, the SPA pattern that floods dirlist. 
+func softWildcardApp() *httptest.Server { + const softBody = "app shell - route handled client side" + mux := http.NewServeMux() + mux.HandleFunc("/admin", func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte("admin control panel dashboard here")) + }) + mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte("please sign in with your account credentials now")) + }) + // catch-all: anything else gets the identical soft-404 shell + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/admin" || r.URL.Path == "/login" { + return + } + w.Write([]byte(softBody)) + }) + return httptest.NewServer(mux) +} + +func TestDirlist_CalibrationSuppressesWildcard(t *testing.T) { + srv := softWildcardApp() + defer srv.Close() + + // the wordlist mixes the two real paths with several bogus ones the catch-all + // answers with the soft-404 shell. + dir := t.TempDir() + wordlist := filepath.Join(dir, "words.txt") + if err := os.WriteFile(wordlist, []byte("admin\nlogin\nnope\nbogus\nmissing\n"), 0o600); err != nil { + t.Fatalf("write wordlist: %v", err) + } + + // without calibration every bogus path is a soft-404 200 and floods output + noAC, err := Dirlist("small", srv.URL, 5*time.Second, 3, "", DirlistOptions{Wordlist: wordlist}) + if err != nil { + t.Fatalf("Dirlist (no -ac): %v", err) + } + if len(noAC) < 5 { + t.Fatalf("expected the wildcard to flood all 5 paths without -ac, got %d", len(noAC)) + } + + // with -ac the soft-404 baseline is learned and the bogus paths drop + withAC, err := Dirlist("small", srv.URL, 5*time.Second, 3, "", DirlistOptions{ + Wordlist: wordlist, + Calibrate: true, + }) + if err != nil { + t.Fatalf("Dirlist (-ac): %v", err) + } + + got := pathSet(withAC) + if !has(got, "/admin") || !has(got, "/login") { + t.Errorf("real paths admin/login must still surface with -ac, got %v", sortedKeys(got)) + } + for _, bogus := range []string{"/nope", "/bogus", "/missing"} { + if has(got, 
bogus) { + t.Errorf("soft-404 path %s should be suppressed by -ac, got %v", bogus, sortedKeys(got)) + } + } +} + +func TestDirlist_ExtensionExpansion(t *testing.T) { + // the server only answers config.php; the bare word and other extensions hit + // the catch-all soft-404, so -e must be what surfaces config.php. + const realBody = "" + mux := http.NewServeMux() + mux.HandleFunc("/config.php", func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte(realBody)) + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + http.NotFound(w, r) // hard 404 for everything but config.php + }) + srv := httptest.NewServer(mux) + defer srv.Close() + + dir := t.TempDir() + wordlist := filepath.Join(dir, "words.txt") + if err := os.WriteFile(wordlist, []byte("config\n"), 0o600); err != nil { + t.Fatalf("write wordlist: %v", err) + } + + results, err := Dirlist("small", srv.URL, 5*time.Second, 2, "", DirlistOptions{ + Wordlist: wordlist, + Extensions: "php,bak", + }) + if err != nil { + t.Fatalf("Dirlist: %v", err) + } + + got := pathSet(results) + if !has(got, "/config.php") { + t.Errorf("expected -e to surface config.php, got %v", sortedKeys(got)) + } + if has(got, "/config") || has(got, "/config.bak") { + t.Errorf("only config.php exists; bare word and .bak are 404s, got %v", sortedKeys(got)) + } +} + +func TestDirlist_LocalWordlistOverridesSize(t *testing.T) { + // a local -w must be used verbatim and never touch directoryURL; point the + // remote at a sink that fails the test if it's ever hit. 
+ orig := directoryURL + directoryURL = "http://127.0.0.1:0/should-not-be-fetched/" + defer func() { directoryURL = orig }() + + mux := http.NewServeMux() + mux.HandleFunc("/secret", func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte("top secret area found")) + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + http.NotFound(w, r) + }) + srv := httptest.NewServer(mux) + defer srv.Close() + + dir := t.TempDir() + wordlist := filepath.Join(dir, "custom.txt") + if err := os.WriteFile(wordlist, []byte("secret\nabsent\n"), 0o600); err != nil { + t.Fatalf("write wordlist: %v", err) + } + + results, err := Dirlist("large", srv.URL, 5*time.Second, 2, "", DirlistOptions{Wordlist: wordlist}) + if err != nil { + t.Fatalf("Dirlist: %v", err) + } + + got := pathSet(results) + if !has(got, "/secret") { + t.Errorf("expected the custom wordlist to find /secret, got %v", sortedKeys(got)) + } + if has(got, "/absent") { + t.Errorf("/absent is a 404 and should not surface, got %v", sortedKeys(got)) + } +} + +// pathSet collects each result's url path for membership checks. it reuses the +// package-level sortedKeys (crawl.go) for deterministic failure output. +func pathSet(results DirectoryResults) map[string]struct{} { + set := make(map[string]struct{}, len(results)) + for i := 0; i < len(results); i++ { + if idx := strings.Index(results[i].Url, "://"); idx >= 0 { + rest := results[i].Url[idx+len("://"):] + if slash := strings.Index(rest, "/"); slash >= 0 { + set[rest[slash:]] = struct{}{} + continue + } + } + set[results[i].Url] = struct{}{} + } + return set +} + +// has is a tiny readability helper for set membership in assertions. 
+func has(set map[string]struct{}, key string) bool { + _, ok := set[key] + return ok +} diff --git a/internal/scan/integration_test.go b/internal/scan/integration_test.go index 9e0072f..0e86075 100644 --- a/internal/scan/integration_test.go +++ b/internal/scan/integration_test.go @@ -134,7 +134,7 @@ func TestIntegrationDirlist(t *testing.T) { directoryURL = srv.URL + "/" defer func() { directoryURL = orig }() - results, err := Dirlist("small", srv.URL, 5*time.Second, 3, "") + results, err := Dirlist("small", srv.URL, 5*time.Second, 3, "", DirlistOptions{}) if err != nil { t.Fatalf("Dirlist: %v", err) } diff --git a/man/sif.1 b/man/sif.1 index bb160a7..f763167 100644 --- a/man/sif.1 +++ b/man/sif.1 @@ -38,6 +38,30 @@ file with one url per line. .BR \-dirlist " \fIsize\fR" directory and file fuzzing (small/medium/large). .TP +.BR \-mc " \fIcodes\fR" +dirlist: match only these status codes (comma list, e.g. 200,301). +.TP +.BR \-fc " \fIcodes\fR" +dirlist: filter out these status codes (comma list). +.TP +.BR \-fs " \fIsizes\fR" +dirlist: filter out responses of these body sizes (comma list). +.TP +.BR \-fw " \fIcounts\fR" +dirlist: filter out responses with these word counts (comma list). +.TP +.BR \-fr " \fIregex\fR" +dirlist: filter out responses whose body matches this regex. +.TP +.B \-ac +dirlist: auto\-calibrate the soft\-404 wildcard baseline so catch\-all 200s are dropped. +.TP +.BR \-w " \fIpath|url\fR" +dirlist: custom wordlist (local file or url); overrides the \fB\-dirlist\fR size. +.TP +.BR \-e " \fIexts\fR" +dirlist: extensions appended to each word (comma list, e.g. php,bak,env). +.TP .BR \-dnslist " \fIsize\fR" subdomain enumeration (small/medium/large). 
.TP diff --git a/sif.go b/sif.go index bb6b705..90d0174 100644 --- a/sif.go +++ b/sif.go @@ -231,11 +231,20 @@ func (app *App) Run() error { } if app.settings.Dirlist != "none" { - result, err := scan.Dirlist(app.settings.Dirlist, url, app.settings.Timeout, app.settings.Threads, app.settings.LogDir) + result, err := scan.Dirlist(app.settings.Dirlist, url, app.settings.Timeout, app.settings.Threads, app.settings.LogDir, scan.DirlistOptions{ + MatchCodes: app.settings.DirMatchCodes, + FilterCodes: app.settings.DirFilterCodes, + FilterSizes: app.settings.DirFilterSizes, + FilterWords: app.settings.DirFilterWords, + FilterRegex: app.settings.DirFilterRegex, + Calibrate: app.settings.DirCalibrate, + Wordlist: app.settings.DirWordlist, + Extensions: app.settings.DirExtensions, + }) if err != nil { log.Errorf("Error while running directory scan: %s", err) } else { - moduleResults = append(moduleResults, ModuleResult{"dirlist", result}) + moduleResults = append(moduleResults, NewModuleResult(result)) scansRun = append(scansRun, "Directory Listing") } }