diff --git a/CHANGELOG.md b/CHANGELOG.md
index b035aaf8..47a84ebe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 ### New Features
 
+- engine: short circuit logic nodes for better performance #824 @williballenthin
+- engine: add optimizer to order faster nodes first #829 @williballenthin
 - engine: optimize rule evaluation by skipping rules that can't match #830 @williballenthin
 
 ### Breaking Changes
diff --git a/capa/engine.py b/capa/engine.py
index 9591a2c5..2dbb3574 100644
--- a/capa/engine.py
+++ b/capa/engine.py
@@ -46,9 +46,12 @@ class Statement:
     def __repr__(self):
         return str(self)
 
-    def evaluate(self, features: FeatureSet) -> Result:
+    def evaluate(self, features: FeatureSet, short_circuit=True) -> Result:
         """
         classes that inherit `Statement` must implement `evaluate`
+
+        args:
+            short_circuit (bool): if true, then statements like and/or/some may short circuit.
         """
         raise NotImplementedError()
 
@@ -73,35 +76,69 @@ class Statement:
 
 
 class And(Statement):
-    """match if all of the children evaluate to True."""
+    """
+    match if all of the children evaluate to True.
+
+    the order of evaluation is dictated by the property
+    `And.children` (type: List[Statement|Feature]).
+    a query optimizer may safely manipulate the order of these children.
+    """
 
     def __init__(self, children, description=None):
         super(And, self).__init__(description=description)
         self.children = children
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, short_circuit=True):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.and"] += 1
 
-        results = [child.evaluate(ctx) for child in self.children]
-        success = all(results)
-        return Result(success, self, results)
+        if short_circuit:
+            results = []
+            for child in self.children:
+                result = child.evaluate(ctx, short_circuit=short_circuit)
+                results.append(result)
+                if not result:
+                    # short circuit
+                    return Result(False, self, results)
+
+            return Result(True, self, results)
+        else:
+            results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
+            success = all(results)
+            return Result(success, self, results)
 
 
 class Or(Statement):
-    """match if any of the children evaluate to True."""
+    """
+    match if any of the children evaluate to True.
+
+    the order of evaluation is dictated by the property
+    `Or.children` (type: List[Statement|Feature]).
+    a query optimizer may safely manipulate the order of these children.
+    """
 
     def __init__(self, children, description=None):
         super(Or, self).__init__(description=description)
         self.children = children
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, short_circuit=True):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.or"] += 1
 
-        results = [child.evaluate(ctx) for child in self.children]
-        success = any(results)
-        return Result(success, self, results)
+        if short_circuit:
+            results = []
+            for child in self.children:
+                result = child.evaluate(ctx, short_circuit=short_circuit)
+                results.append(result)
+                if result:
+                    # short circuit as soon as we hit one match
+                    return Result(True, self, results)
+
+            return Result(False, self, results)
+        else:
+            results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
+            success = any(results)
+            return Result(success, self, results)
 
 
 class Not(Statement):
@@ -111,34 +148,55 @@ class Not(Statement):
         super(Not, self).__init__(description=description)
         self.child = child
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, short_circuit=True):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.not"] += 1
 
-        results = [self.child.evaluate(ctx)]
+        results = [self.child.evaluate(ctx, short_circuit=short_circuit)]
         success = not results[0]
         return Result(success, self, results)
 
 
 class Some(Statement):
-    """match if at least N of the children evaluate to True."""
+    """
+    match if at least N of the children evaluate to True.
+
+    the order of evaluation is dictated by the property
+    `Some.children` (type: List[Statement|Feature]).
+    a query optimizer may safely manipulate the order of these children.
+    """
 
     def __init__(self, count, children, description=None):
         super(Some, self).__init__(description=description)
         self.count = count
         self.children = children
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, short_circuit=True):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.some"] += 1
 
-        results = [child.evaluate(ctx) for child in self.children]
-        # note that here we cast the child result as a bool
-        # because we've overridden `__bool__` above.
-        #
-        # we can't use `if child is True` because the instance is not True.
-        success = sum([1 for child in results if bool(child) is True]) >= self.count
-        return Result(success, self, results)
+        if short_circuit:
+            results = []
+            satisfied_children_count = 0
+            for child in self.children:
+                result = child.evaluate(ctx, short_circuit=short_circuit)
+                results.append(result)
+                if result:
+                    satisfied_children_count += 1
+
+                if satisfied_children_count >= self.count:
+                    # short circuit as soon as we hit the threshold
+                    return Result(True, self, results)
+
+            # no early exit: decide from the tally, so that an empty child list agrees with the branch below.
+            return Result(satisfied_children_count >= self.count, self, results)
+        else:
+            results = [child.evaluate(ctx, short_circuit=short_circuit) for child in self.children]
+            # note that here we cast the child result as a bool
+            # because we've overridden `__bool__` above.
+            #
+            # we can't use `if child is True` because the instance is not True.
+            success = sum([1 for child in results if bool(child) is True]) >= self.count
+            return Result(success, self, results)
 
 
 class Range(Statement):
@@ -150,7 +208,7 @@ class Range(Statement):
         self.min = min if min is not None else 0
         self.max = max if max is not None else (1 << 64 - 1)
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, **kwargs):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.range"] += 1
 
@@ -178,7 +236,7 @@ class Subscope(Statement):
         self.scope = scope
         self.child = child
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, **kwargs):
         raise ValueError("cannot evaluate a subscope directly!")
 
 
@@ -241,8 +299,18 @@ def match(rules: List["capa.rules.Rule"], features: FeatureSet, va: int) -> Tupl
     features = collections.defaultdict(set, copy.copy(features))
 
     for rule in rules:
-        res = rule.evaluate(features)
+        res = rule.evaluate(features, short_circuit=True)
         if res:
+            # we first matched the rule with short circuiting enabled.
+            # this is much faster than without short circuiting.
+            # however, we want to collect all results thoroughly,
+            # so once we've found a match quickly,
+            # go back and capture results without short circuiting.
+            res = rule.evaluate(features, short_circuit=False)
+
+            # sanity check
+            assert bool(res) is True
+
             results[rule.name].append((va, res))
             # we need to update the current `features`
             # because subsequent iterations of this loop may use newly added features,
diff --git a/capa/features/common.py b/capa/features/common.py
index 3a4e71e9..6b867766 100644
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -146,7 +146,7 @@ class Feature:
     def __repr__(self):
         return str(self)
 
-    def evaluate(self, ctx: Dict["Feature", Set[int]]) -> Result:
+    def evaluate(self, ctx: Dict["Feature", Set[int]], **kwargs) -> Result:
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature." + self.name] += 1
         return Result(self in ctx, self, [], locations=ctx.get(self, []))
@@ -192,7 +192,7 @@ class Substring(String):
         super(Substring, self).__init__(value, description=description)
         self.value = value
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, short_circuit=True):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.substring"] += 1
 
@@ -210,6 +210,10 @@ class Substring(String):
 
             if self.value in feature.value:
                 matches[feature.value].extend(locations)
+                if short_circuit:
+                    # we found one matching string, that's sufficient to match.
+                    # don't collect other matching strings in this mode.
+                    break
 
         if matches:
             # finalize: defaultdict -> dict
@@ -280,7 +284,7 @@ class Regex(String):
                 "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value
             )
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, short_circuit=True):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.regex"] += 1
 
@@ -302,6 +306,10 @@ class Regex(String):
             # so that they don't have to prefix/suffix their terms like: /.*foo.*/.
             if self.re.search(feature.value):
                 matches[feature.value].extend(locations)
+                if short_circuit:
+                    # we found one matching string, that's sufficient to match.
+                    # don't collect other matching strings in this mode.
+                    break
 
         if matches:
             # finalize: defaultdict -> dict
@@ -366,7 +374,7 @@ class Bytes(Feature):
         super(Bytes, self).__init__(value, description=description)
         self.value = value
 
-    def evaluate(self, ctx):
+    def evaluate(self, ctx, **kwargs):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.bytes"] += 1
 
diff --git a/capa/optimizer.py b/capa/optimizer.py
new file mode 100644
index 00000000..0408bf07
--- /dev/null
+++ b/capa/optimizer.py
@@ -0,0 +1,70 @@
+import logging
+
+import capa.engine as ceng
+import capa.features.common
+
+logger = logging.getLogger(__name__)
+
+
+def get_node_cost(node):
+    if isinstance(node, (capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format)):
+        # we assume these are the most restrictive features:
+        # authors commonly use them at the start of rules to restrict the category of samples to inspect
+        return 0
+
+    # elif "everything else":
+    #   return 1
+    #
+    # this should be all hash-lookup features.
+    # see below.
+
+    elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)):
+        # substring and regex features require a full scan of each string
+        # which we anticipate is more expensive than a hash lookup feature (e.g. mnemonic or count).
+        #
+        # TODO: compute the average cost of these features relative to a hash lookup feature
+        # and adjust the factor accordingly.
+        return 2
+
+    elif isinstance(node, (ceng.Not, ceng.Range)):
+        # the cost of these nodes is defined by the complexity of their single child.
+        return 1 + get_node_cost(node.child)
+
+    elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
+        # the cost of these nodes is the full cost of their children
+        # as this is the worst-case scenario.
+        return 1 + sum(map(get_node_cost, node.children))
+
+    else:
+        # this should be all hash-lookup features.
+        # we give this an arbitrary weight of 1.
+        # the only thing more "important" than this is checking OS/Arch/Format.
+        return 1
+
+
+def optimize_statement(statement):
+    # this routine operates in-place and walks the whole statement tree,
+    # so that nested and/or/some nodes get their children reordered, too.
+    if isinstance(statement, (ceng.And, ceng.Or, ceng.Some)):
+        # has .children: optimize each subtree, then order the children cheapest first.
+        # reordering a subtree does not change its cost, and `sorted` is stable,
+        # so children with the same cost stay in the order the rule author wrote.
+        for child in statement.children:
+            optimize_statement(child)
+        statement.children = sorted(statement.children, key=get_node_cost)
+    elif isinstance(statement, (ceng.Not, ceng.Range)):
+        # has .child
+        optimize_statement(statement.child)
+    # otherwise: appears to be "simple", so there's nothing to reorder.
+
+
+def optimize_rule(rule):
+    # this routine operates in-place
+    optimize_statement(rule.statement)
+
+
+def optimize_rules(rules):
+    logger.debug("optimizing %d rules", len(rules))
+    for rule in rules:
+        optimize_rule(rule)
+    return rules
diff --git a/capa/rules.py b/capa/rules.py
index 3646117a..0eaaeaee 100644
--- a/capa/rules.py
+++ b/capa/rules.py
@@ -31,6 +31,7 @@ import ruamel.yaml
 import capa.perf
 import capa.engine as ceng
 import capa.features
+import capa.optimizer
 import capa.features.file
 import capa.features.insn
 import capa.features.common
@@ -627,10 +628,10 @@ class Rule:
         for new_rule in self._extract_subscope_rules_rec(self.statement):
             yield new_rule
 
-    def evaluate(self, features: FeatureSet):
+    def evaluate(self, features: FeatureSet, short_circuit=True):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.rule"] += 1
-        return self.statement.evaluate(features)
+        return self.statement.evaluate(features, short_circuit=short_circuit)
 
     @classmethod
     def from_dict(cls, d, definition):
@@ -968,6 +969,8 @@ class RuleSet:
         if len(rules) == 0:
             raise InvalidRuleSet("no rules selected")
 
+        rules = capa.optimizer.optimize_rules(rules)
+
         self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE)
         self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE)
         self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE)
diff --git a/scripts/profile-time.py b/scripts/profile-time.py
index 3c47b67b..3d6b5e07 100644
--- a/scripts/profile-time.py
+++ b/scripts/profile-time.py
@@ -1,3 +1,34 @@
+"""
+Invoke capa multiple times and record profiling information.
+Use the --number and --repeat options to change the number of iterations.
+By default, the script will emit a markdown table with a label pulled from git.
+
+Note: you can run this script against pre-generated .frz files to reduce the startup time.
+
+usage:
+
+    usage: profile-time.py [--number NUMBER] [--repeat REPEAT] [--label LABEL] sample
+
+    Profile capa performance
+
+    positional arguments:
+      sample                path to sample to analyze
+
+    optional arguments:
+      --number NUMBER       batch size of profile collection
+      --repeat REPEAT       batch count of profile collection
+      --label LABEL         description of the profile collection
+
+example:
+
+    $ python profile-time.py ./tests/data/kernel32.dll_.frz --number 1 --repeat 2
+
+    | label                                | count(evaluations)   | avg(time)   | min(time)   | max(time)   |
+    |--------------------------------------|----------------------|-------------|-------------|-------------|
+    | 18c30e4 main: remove perf debug msgs | 66,561,622           | 132.13s     | 125.14s     | 139.12s     |
+
+      ^^^ --label or git hash               
+"""
 import sys
 import timeit
 import logging
@@ -98,12 +129,16 @@ def main(argv=None):
                 (
                     args.label,
                     "{:,}".format(capa.perf.counters["evaluate.feature"]),
-                    "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)),
+                    # python documentation indicates that min(samples) should be preferred,
+                    # so let's put that first.
+                    #
+                    # https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat
                     "%0.2fs" % (min(samples) / float(args.number)),
+                    "%0.2fs" % (sum(samples) / float(args.repeat) / float(args.number)),
                     "%0.2fs" % (max(samples) / float(args.number)),
                 )
             ],
-            headers=["label", "count(evaluations)", "avg(time)", "min(time)", "max(time)"],
+            headers=["label", "count(evaluations)", "min(time)", "avg(time)", "max(time)"],
             tablefmt="github",
         )
     )
diff --git a/tests/test_engine.py b/tests/test_engine.py
index ce421759..b07c89e6 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -533,3 +533,29 @@ def test_render_offset():
     assert str(capa.features.insn.Offset(1)) == "offset(0x1)"
     assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X32)) == "offset/x32(0x1)"
     assert str(capa.features.insn.Offset(1, bitness=capa.features.common.BITNESS_X64)) == "offset/x64(0x1)"
+
+
+def test_short_circuit():
+    assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True
+
+    # with short circuiting, only the children up until the first satisfied child are captured.
+    assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}, short_circuit=True).children) == 1
+    assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}, short_circuit=False).children) == 2
+
+
+def test_eval_order():
+    # base cases.
+    assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}) == True
+    assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}) == True
+
+    # with short circuiting, only the children up until the first satisfied child are captured.
+    assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children) == 1
+    assert len(Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children) == 2
+    assert len(Or([Number(1), Number(2)]).evaluate({Number(1): {1}, Number(2): {1}}).children) == 1
+
+    # and it's guaranteed that children are evaluated in order.
+    assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children[0].statement == Number(1)
+    assert Or([Number(1), Number(2)]).evaluate({Number(1): {1}}).children[0].statement != Number(2)
+
+    assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children[1].statement == Number(2)
+    assert Or([Number(1), Number(2)]).evaluate({Number(2): {1}}).children[1].statement != Number(1)
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
new file mode 100644
index 00000000..69a79bd6
--- /dev/null
+++ b/tests/test_optimizer.py
@@ -0,0 +1,65 @@
+# Copyright (C) 2021 FireEye, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import textwrap
+
+import pytest
+
+import capa.rules
+import capa.engine
+import capa.optimizer
+import capa.features.common
+from capa.engine import Or, And
+from capa.features.insn import Mnemonic
+from capa.features.common import Arch, Bytes, Substring
+
+
+def test_optimizer_order():
+    rule = textwrap.dedent(
+        """
+        rule:
+            meta:
+                name: test rule
+                scope: function
+            features:
+                - and:
+                    - substring: "foo"
+                    - arch: amd64
+                    - mnemonic: cmp
+                    - and:
+                      - bytes: 3
+                      - offset: 2
+                    - or:
+                      - number: 1
+                      - offset: 4
+        """
+    )
+    r = capa.rules.Rule.from_yaml(rule)
+
+    # before optimization
+    children = list(r.statement.get_children())
+    assert isinstance(children[0], Substring)
+    assert isinstance(children[1], Arch)
+    assert isinstance(children[2], Mnemonic)
+    assert isinstance(children[3], And)
+    assert isinstance(children[4], Or)
+
+    # after optimization
+    capa.optimizer.optimize_rules([r])
+    children = list(r.statement.get_children())
+
+    # cost: 0
+    assert isinstance(children[0], Arch)
+    # cost: 1
+    assert isinstance(children[1], Mnemonic)
+    # cost: 2
+    assert isinstance(children[2], Substring)
+    # cost: 3
+    assert isinstance(children[3], Or)
+    # cost: 4
+    assert isinstance(children[4], And)