From 0bdbdf7d2188ee8f6067d1ae0cf275567493bc1e Mon Sep 17 00:00:00 2001 From: Ad Schellevis Date: Thu, 16 May 2019 13:54:24 +0200 Subject: [PATCH] netflow, improve log parser performance, cache some calculations. In theory we could probably increase performance even more by limiting the number of calls to struct.unpack, but this will make it more difficult to read. This version is about 30% faster than the original C version shipped with flowd. Some more testing to do, but preliminary results look good. --- .../scripts/netflow/lib/flowparser.py | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/opnsense/scripts/netflow/lib/flowparser.py b/src/opnsense/scripts/netflow/lib/flowparser.py index 6045f2122..9411767e9 100644 --- a/src/opnsense/scripts/netflow/lib/flowparser.py +++ b/src/opnsense/scripts/netflow/lib/flowparser.py @@ -79,15 +79,21 @@ class FlowParser: def __init__(self, filename): self._filename = filename + # cache formatter vs byte length + self._fmt_cache = dict() + # pre-calculate powers of 2 + self._pow = dict() + for idx in range(len(self.field_definition_order)): + self._pow[idx] = pow(2, idx) - @staticmethod - def calculate_size(fmt): - fmts = {'B': 1, 'H': 2, 'I': 4, 'Q': 8} - result = 0 - for key in fmt: - if key in fmts: - result += fmts[key] - return result + def calculate_size(self, fmt): + if fmt not in self._fmt_cache: + fmts = {'B': 1, 'H': 2, 'I': 4, 'Q': 8} + self._fmt_cache[fmt] = 0 + for key in fmt: + if key in fmts: + self._fmt_cache[fmt] += fmts[key] + return self._fmt_cache[fmt] def _parse_binary(self, raw_data, data_fields): """ parse binary record @@ -98,11 +104,14 @@ class FlowParser: raw_data_idx = 0 raw_record = dict() for idx in range(len(self.field_definition_order)): - if pow(2, idx) & data_fields: + if self._pow[idx] & data_fields: fieldname = self.field_definition_order[idx] if fieldname in self.field_definition: fsize = self.calculate_size(self.field_definition[fieldname]) - content = 
struct.unpack(self.field_definition[fieldname], raw_data[raw_data_idx:raw_data_idx + fsize]) + content = struct.unpack( + self.field_definition[fieldname], + raw_data[raw_data_idx:raw_data_idx + fsize] + ) raw_record[fieldname] = content[0] if len(content) == 1 else content raw_data_idx += fsize @@ -140,9 +149,9 @@ class FlowParser: # concat ipv4/v6 fields into field without [4,6] for key in self.field_definition_order: if key in record: - if key.endswith('4') and len(record[key]) == 4: + if key[-1] == '4' and len(record[key]) == 4: record[key[:-1]] = ip_formatv4.format(*record[key]) - elif key.endswith('6') and len(record[key]) == 16: + elif key[-1] == '6' and len(record[key]) == 16: record[key[:-1]] = ip_formatv6.format(*record[key]) # calculated values