From 58ca02846b63c37672c56e2a040a8f8255fa7662 Mon Sep 17 00:00:00 2001
From: Ad Schellevis
Date: Sun, 16 Oct 2016 19:26:55 +0200
Subject: [PATCH] (IDS) extend support for compressed files with "inline::"
 keyword to be able to download parts of an archive (required to download
 snort rules). Change processing order to always stream data to tempfiles
 first, so we can reuse downloaded files (an archive scan isn't very
 expensive)

---
 .../scripts/suricata/lib/downloader.py        | 43 +++++++++++++------
 src/opnsense/scripts/suricata/lib/metadata.py |  7 ++-
 src/opnsense/scripts/suricata/rule-updater.py |  2 +-
 3 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/src/opnsense/scripts/suricata/lib/downloader.py b/src/opnsense/scripts/suricata/lib/downloader.py
index d530975b7..61e497271 100644
--- a/src/opnsense/scripts/suricata/lib/downloader.py
+++ b/src/opnsense/scripts/suricata/lib/downloader.py
@@ -40,6 +40,7 @@ import requests
 class Downloader(object):
     def __init__(self, target_dir):
         self._target_dir = target_dir
+        self._download_cache = dict()
 
     def filter(self, in_data, filter_type):
         """ apply input filter to downloaded data
@@ -68,8 +69,15 @@
         return '\n'.join(output)
 
     @staticmethod
-    def _unpack(req_obj, source_url):
-        source_url = source_url.strip().lower()
+    def _unpack(src, source_url, filename=None):
+        """ unpack data if archived
+        :param src: handle to temp file
+        :param source_url: location where file was downloaded from
+        :param filename: filename to extract
+        :return: text
+        """
+        src.seek(0)
+        source_url = source_url.strip().lower().split('?')[0]
         unpack_type=None
         if source_url.endswith('.tar.gz') or source_url.endswith('.tgz'):
             unpack_type = 'tar'
@@ -80,15 +88,14 @@
 
         if unpack_type is not None:
             rule_content = list()
-            # flush to temp
-            src = tempfile.NamedTemporaryFile()
-            shutil.copyfileobj(req_obj.raw, src)
-            src.seek(0)
             # handle compression types
             if unpack_type == 'tar':
                 tf = tarfile.open(fileobj=src)
                 for tf_file in tf.getmembers():
-                    if tf_file.isfile() and tf_file.name.lower().endswith('.rules'):
+                    # extract partial or all (*.rules) from archive
+                    if filename is not None and tf_file.name == filename:
+                        rule_content.append(tf.extractfile(tf_file).read())
+                    elif filename is None and tf_file.isfile() and tf_file.name.lower().endswith('.rules'):
                         rule_content.append(tf.extractfile(tf_file).read())
             elif unpack_type == 'gz':
                 gf = gzip.GzipFile(mode='r', fileobj=src)
@@ -96,13 +103,15 @@
             elif unpack_type == 'zip':
                 with zipfile.ZipFile(src, mode='r', compression=zipfile.ZIP_DEFLATED) as zf:
                     for item in zf.infolist():
-                        if item.file_size > 0 and item.filename.lower().endswith('.rules'):
+                        if filename is not None and item.filename == filename:
+                            rule_content.append(zf.open(item).read())
+                        elif filename is None and item.file_size > 0 and item.filename.lower().endswith('.rules'):
                             rule_content.append(zf.open(item).read())
             return '\n'.join(rule_content)
         else:
-            return req_obj.text
+            return src.read()
 
-    def download(self, proto, url, filename, input_filter):
+    def download(self, proto, url, url_filename, filename, input_filter):
         """ download ruleset file
         :param proto: protocol (http,https)
         :param url: download url
@@ -111,11 +120,19 @@
         """
         if proto in ('http', 'https'):
             frm_url = url.replace('//', '/').replace(':/', '://')
-            req = requests.get(url=frm_url, stream=True)
-            if req.status_code == 200:
+            # stream to temp file
+            if frm_url not in self._download_cache:
+                req = requests.get(url=frm_url, stream=True)
+                if req.status_code == 200:
+                    src = tempfile.NamedTemporaryFile()
+                    shutil.copyfileobj(req.raw, src)
+                    self._download_cache[frm_url] = src
+
+            # process rules from tempfile (prevent duplicate download for files within an archive)
+            if frm_url in self._download_cache:
                 try:
                     target_filename = '%s/%s' % (self._target_dir, filename)
-                    save_data = self._unpack(req, url)
+                    save_data = self._unpack(self._download_cache[frm_url], url, url_filename)
                     save_data = self.filter(save_data, input_filter)
                     open(target_filename, 'wb').write(save_data)
                 except IOError:
diff --git a/src/opnsense/scripts/suricata/lib/metadata.py b/src/opnsense/scripts/suricata/lib/metadata.py
index a717e16fb..a848a6594 100644
--- a/src/opnsense/scripts/suricata/lib/metadata.py
+++ b/src/opnsense/scripts/suricata/lib/metadata.py
@@ -68,7 +68,12 @@
                     metadata_record['documentation_url'] = documentation_url
                     metadata_record['source'] = src_location.attrib
                     metadata_record['filename'] = rule_filename.text.strip()
-                    if 'url' in rule_filename.attrib:
+                    # for an archive, define file to extract
+                    metadata_record['url_filename'] = None
+                    if 'url' in rule_filename.attrib and rule_filename.attrib['url'].startswith('inline::'):
+                        metadata_record['url'] = (metadata_record['source']['url'])
+                        metadata_record['url_filename'] = rule_filename.attrib['url'][8:]
+                    elif 'url' in rule_filename.attrib:
                         metadata_record['url'] = (rule_filename.attrib['url'])
                     else:
                         metadata_record['url'] = ('%s/%s' % (metadata_record['source']['url'],
diff --git a/src/opnsense/scripts/suricata/rule-updater.py b/src/opnsense/scripts/suricata/rule-updater.py
index f388b243b..5eb60030d 100755
--- a/src/opnsense/scripts/suricata/rule-updater.py
+++ b/src/opnsense/scripts/suricata/rule-updater.py
@@ -78,5 +78,5 @@ if __name__ == '__main__':
                 pass
             else:
                 input_filter = enabled_rulefiles[rule['filename']]['filter']
-            dl.download(proto=download_proto, url=rule['url'],
+            dl.download(proto=download_proto, url=rule['url'], url_filename=rule['url_filename'],
                         filename=rule['filename'], input_filter=input_filter)
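
Usage sketch (illustrative, not part of the commit): the new "inline::"
keyword lets a ruleset metadata entry select a single member of a
downloaded archive instead of every *.rules file it contains. Assuming a
hypothetical archive URL and member path, the Downloader call that
rule-updater.py ends up making would look like:

    # hypothetical values; metadata.py derives url_filename by stripping
    # the 8-character "inline::" prefix ([8:]) from the filename element's
    # url attribute and pointing url at the source archive instead
    dl = Downloader(target_dir='/usr/local/etc/suricata/rules')
    dl.download(proto='https',
                url='https://example.com/snortrules-snapshot.tar.gz',
                url_filename='rules/emerging-exploit.rules',  # member inside the tarball
                filename='emerging-exploit.rules',            # local target filename
                input_filter=input_filter)

Because the response is now always streamed to a NamedTemporaryFile and
cached per URL in self._download_cache, a second download() call for
another member of the same archive rescans the cached tempfile rather than
fetching the archive again, which is what makes per-file extraction from
one snort rules tarball affordable.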