rules: introduce helper to parse features from parts

This commit is contained in:
Willi Ballenthin
2026-04-13 15:40:08 +02:00
parent b9f830619d
commit c6e2d4f89d
3 changed files with 220 additions and 143 deletions

View File

@@ -369,7 +369,7 @@ def translate_com_feature(com_name: str, com_type: ComType) -> ceng.Statement:
def parse_int(s: str) -> int:
if s.startswith("0x"):
if s.startswith(("0x", "-0x")):
return int(s, 0x10)
else:
return int(s, 10)
@@ -632,6 +632,213 @@ def is_subscope_compatible(scope: Scope | None, subscope: Scope) -> bool:
raise ValueError("unexpected scope")
def build_feature(
key: str, initial_value: str | int, initial_description: str | None = None
) -> Feature | ceng.Range | ceng.Statement:
"""
from a key-value pair, like ("number": "12 = Foo"), return a Feature (or Range or Statement).
parses the description from the value, or uses the initial_description if provided.
returns: Feature usually, or Range for count(...) features, or Statement for COM-derived featues.
"""
if key.startswith("count(") and key.endswith(")"):
# e.g.:
#
# count(basic block)
# count(mnemonic(mov))
# count(characteristic(nzxor))
term = key[len("count(") : -len(")")]
# when looking for the existence of such a feature, our rule might look like:
# - mnemonic: mov
#
# but here we deal with the form: `mnemonic(mov)`.
term, _, arg = term.partition("(")
Feature = parse_feature(term)
if arg:
arg = arg[: -len(")")]
# can't rely on yaml parsing ints embedded within strings
# like:
#
# count(offset(0xC))
# count(number(0x11223344))
# count(number(0x100 = description))
if term != "string":
value, description = parse_description(arg, term)
if term == "api":
if not isinstance(value, str):
raise InvalidRule(f"unexpected {term} value type: {type(value)}")
value = trim_dll_part(value)
feature = Feature(value, description=description) # type: ignore[call-arg] # Feature is a runtime union; constructor args vary per subclass
else:
# arg is string (which doesn't support inline descriptions), like:
#
# count(string(error))
#
# known problem that embedded newlines may not work here?
# this may become a problem (or not), so address it when encountered.
feature = Feature(arg)
else:
feature = Feature() # type: ignore[call-arg] # Feature is a runtime union; constructor args vary per subclass
# initial value might be things like:
# - 10
# - "10"
# - "10 or more"
count: int | str = initial_value
if isinstance(count, int):
return ceng.Range(feature, min=count, max=count, description=initial_description)
elif count.endswith(" or more"):
min = parse_int(count[: -len(" or more")])
max = None
return ceng.Range(feature, min=min, max=max, description=initial_description)
elif count.endswith(" or fewer"):
min = None
max = parse_int(count[: -len(" or fewer")])
return ceng.Range(feature, min=min, max=max, description=initial_description)
elif count.startswith("("):
min, max = parse_range(count)
return ceng.Range(feature, min=min, max=max, description=initial_description)
else:
try:
# convert "10" -> 10
count = parse_int(count)
except ValueError:
raise InvalidRule(f"unexpected range: {count}")
return ceng.Range(feature, min=count, max=count, description=initial_description)
elif key == "string" and not isinstance(initial_value, str):
raise InvalidRule(f"ambiguous string value {initial_value}, must be defined as explicit string")
elif key.startswith("operand[") and key.endswith("].number"):
try:
index = int(key[len("operand[") : -len("].number")])
except ValueError as e:
raise InvalidRule("operand index must be an integer") from e
value, description = parse_description(initial_value, key, description=initial_description)
assert isinstance(value, int)
try:
feature = capa.features.insn.OperandNumber(index, value, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
return feature
elif key.startswith("operand[") and key.endswith("].offset"):
try:
index = int(key[len("operand[") : -len("].offset")])
except ValueError as e:
raise InvalidRule("operand index must be an integer") from e
value, description = parse_description(initial_value, key, description=initial_description)
assert isinstance(value, int)
try:
feature = capa.features.insn.OperandOffset(index, value, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
return feature
elif (
(key == "os" and initial_value not in capa.features.common.VALID_OS)
or (key == "format" and initial_value not in capa.features.common.VALID_FORMAT)
or (key == "arch" and initial_value not in capa.features.common.VALID_ARCH)
):
raise InvalidRule(f"unexpected {key} value {initial_value}")
elif key.startswith("property/"):
access = key[len("property/") :]
if access not in capa.features.common.VALID_FEATURE_ACCESS:
raise InvalidRule(f"unexpected {key} access {access}")
value, description = parse_description(initial_value, key, description=initial_description)
if not isinstance(value, str):
raise InvalidRule(f"unexpected {key} value type: {type(value)}")
try:
feature = capa.features.insn.Property(value, access=access, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
return feature
elif key.startswith("com/"):
com_type_name = str(key[len("com/") :])
try:
com_type = ComType(com_type_name)
except ValueError:
raise InvalidRule(f"unexpected COM type: {com_type_name}")
value, description = parse_description(initial_value, key, description=initial_description)
if not isinstance(value, str):
raise InvalidRule(f"unexpected {key} value type: {type(value)}")
return translate_com_feature(value, com_type)
else:
Feature = parse_feature(key)
value, description = parse_description(initial_value, key, description=initial_description)
try:
match Feature:
case capa.features.insn.OperandNumber | capa.features.insn.OperandOffset:
raise RuntimeError("should be impossible")
case capa.features.insn.Offset | capa.features.insn.Number:
assert isinstance(value, int)
return Feature(value, description=description)
case capa.features.insn.API:
assert isinstance(value, str)
# users can specify an API name with or without the DLL part (e.g. `CreateFileA` or `kernel32.CreateFileA`)
# and capa matches only the API name part, not the DLL part.
# the DLL name is ignored, its essentially just for human-oriented documentation.
# see #1824
value = trim_dll_part(value)
return Feature(value, description=description)
case capa.features.insn.Mnemonic:
assert isinstance(value, str)
return Feature(value, description=description)
case capa.features.basicblock.BasicBlock:
return Feature(description=description)
case (
capa.features.file.Export
| capa.features.file.Import
| capa.features.file.Section
| capa.features.file.FunctionName
):
assert isinstance(value, str)
return Feature(value, description=description)
case capa.features.common.MatchedRule | capa.features.common.Characteristic:
assert isinstance(value, str)
return Feature(value, description=description)
case capa.features.common.StringFactory | capa.features.common.Substring:
assert isinstance(value, str)
return Feature(value, description=description)
case capa.features.common.Class | capa.features.common.Namespace | capa.features.insn.Property:
assert isinstance(value, str)
return Feature(value, description=description)
case capa.features.common.Arch | capa.features.common.OS | capa.features.common.Format:
assert isinstance(value, str)
return Feature(value, description=description)
case capa.features.common.Bytes:
assert isinstance(value, bytes)
return Feature(value, description=description)
case _ as unreachable:
assert_never(unreachable)
except ValueError as e:
raise InvalidRule(str(e)) from e
def build_statements(d, scopes: Scopes):
if len(d.keys()) > 2:
raise InvalidRule("too many statements")
@@ -770,149 +977,19 @@ def build_statements(d, scopes: Scopes):
return ceng.Subscope(Scope.INSTRUCTION, statements, description=description)
elif key.startswith("count(") and key.endswith(")"):
# e.g.:
#
# count(basic block)
# count(mnemonic(mov))
# count(characteristic(nzxor))
term = key[len("count(") : -len(")")]
# when looking for the existence of such a feature, our rule might look like:
# - mnemonic: mov
#
# but here we deal with the form: `mnemonic(mov)`.
term, _, arg = term.partition("(")
Feature = parse_feature(term)
if arg:
arg = arg[: -len(")")]
# can't rely on yaml parsing ints embedded within strings
# like:
#
# count(offset(0xC))
# count(number(0x11223344))
# count(number(0x100 = description))
if term != "string":
value, description = parse_description(arg, term)
if term == "api":
if not isinstance(value, str):
raise InvalidRule(f"unexpected {term} value type: {type(value)}")
value = trim_dll_part(value)
feature = Feature(value, description=description) # type: ignore[call-arg] # Feature is a runtime union; constructor args vary per subclass
else:
# arg is string (which doesn't support inline descriptions), like:
#
# count(string(error))
#
# known problem that embedded newlines may not work here?
# this may become a problem (or not), so address it when encountered.
feature = Feature(arg)
else:
feature = Feature() # type: ignore[call-arg] # Feature is a runtime union; constructor args vary per subclass
ensure_feature_valid_for_scopes(scopes, feature) # type: ignore[arg-type] # StringFactory.__new__ returns Feature subclass at runtime
count = d[key]
if isinstance(count, int):
return ceng.Range(feature, min=count, max=count, description=description)
elif count.endswith(" or more"):
min = parse_int(count[: -len(" or more")])
max = None
return ceng.Range(feature, min=min, max=max, description=description)
elif count.endswith(" or fewer"):
min = None
max = parse_int(count[: -len(" or fewer")])
return ceng.Range(feature, min=min, max=max, description=description)
elif count.startswith("("):
min, max = parse_range(count)
return ceng.Range(feature, min=min, max=max, description=description)
else:
raise InvalidRule(f"unexpected range: {count}")
elif key == "string" and not isinstance(d[key], str):
raise InvalidRule(f"ambiguous string value {d[key]}, must be defined as explicit string")
elif key.startswith("operand[") and key.endswith("].number"):
index = key[len("operand[") : -len("].number")]
try:
index = int(index)
except ValueError as e:
raise InvalidRule("operand index must be an integer") from e
value, description = parse_description(d[key], key, d.get("description"))
assert isinstance(value, int)
try:
feature = capa.features.insn.OperandNumber(index, value, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
ensure_feature_valid_for_scopes(scopes, feature)
return feature
elif key.startswith("operand[") and key.endswith("].offset"):
index = key[len("operand[") : -len("].offset")]
try:
index = int(index)
except ValueError as e:
raise InvalidRule("operand index must be an integer") from e
value, description = parse_description(d[key], key, d.get("description"))
assert isinstance(value, int)
try:
feature = capa.features.insn.OperandOffset(index, value, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
ensure_feature_valid_for_scopes(scopes, feature)
return feature
elif (
(key == "os" and d[key] not in capa.features.common.VALID_OS)
or (key == "format" and d[key] not in capa.features.common.VALID_FORMAT)
or (key == "arch" and d[key] not in capa.features.common.VALID_ARCH)
):
raise InvalidRule(f"unexpected {key} value {d[key]}")
elif key.startswith("property/"):
access = key[len("property/") :]
if access not in capa.features.common.VALID_FEATURE_ACCESS:
raise InvalidRule(f"unexpected {key} access {access}")
value, description = parse_description(d[key], key, d.get("description"))
if not isinstance(value, str):
raise InvalidRule(f"unexpected {key} value type: {type(value)}")
try:
feature = capa.features.insn.Property(value, access=access, description=description)
except ValueError as e:
raise InvalidRule(str(e)) from e
ensure_feature_valid_for_scopes(scopes, feature)
return feature
elif key.startswith("com/"):
com_type_name = str(key[len("com/") :])
try:
com_type = ComType(com_type_name)
except ValueError:
raise InvalidRule(f"unexpected COM type: {com_type_name}")
value, description = parse_description(d[key], key, d.get("description"))
if not isinstance(value, str):
raise InvalidRule(f"unexpected {key} value type: {type(value)}")
return translate_com_feature(value, com_type)
else:
Feature = parse_feature(key)
value, description = parse_description(d[key], key, d.get("description"))
initial_value = d[key]
initial_description = d.get("description")
if key == "api":
if not isinstance(value, str):
raise InvalidRule(f"unexpected {key} value type: {type(value)}")
value = trim_dll_part(value)
feature = build_feature(key, initial_value, initial_description)
# for count(...) features, validate the inner feature rather than the Range wrapper.
# for com/... features, translate_com_feature returns a compound Or(String, Bytes) Statement;
if isinstance(feature, ceng.Range):
ensure_feature_valid_for_scopes(scopes, feature.child)
elif isinstance(feature, Feature):
ensure_feature_valid_for_scopes(scopes, feature)
try:
feature = Feature(value, description=description) # type: ignore[misc] # Feature is a runtime union; constructor args vary per subclass
except ValueError as e:
raise InvalidRule(str(e)) from e
ensure_feature_valid_for_scopes(scopes, feature) # type: ignore[arg-type] # StringFactory.__new__ returns Feature subclass at runtime
return feature

2
rules

Submodule rules updated: 2af9fbfc1c...03a20f69ae