Unbound: blocklists: allow scripts to extend blocklist functionality (#6350)

This commit is contained in:
Stephan de Wit 2023-02-24 15:39:40 +01:00 committed by GitHub
parent 5fa74bfafa
commit 09f40f0e33
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 383 additions and 211 deletions

2
plist
View File

@ -976,6 +976,8 @@
/usr/local/opnsense/scripts/system/temperature.sh
/usr/local/opnsense/scripts/system/trigger_config_changed_events.py
/usr/local/opnsense/scripts/unbound/blocklists.py
/usr/local/opnsense/scripts/unbound/blocklists/__init__.py
/usr/local/opnsense/scripts/unbound/blocklists/default_bl.py
/usr/local/opnsense/scripts/unbound/cache.sh
/usr/local/opnsense/scripts/unbound/check.sh
/usr/local/opnsense/scripts/unbound/logger.py

View File

@ -1,7 +1,7 @@
#!/usr/local/bin/python3
"""
Copyright (c) 2020 Ad Schellevis <ad@opnsense.org>
Copyright (c) 2020-2023 Ad Schellevis <ad@opnsense.org>
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -26,215 +26,8 @@
POSSIBILITY OF SUCH DAMAGE.
"""
import os
import sys
import re
import syslog
import tempfile
import time
import fcntl
from configparser import ConfigParser
import requests
import ujson
def uri_reader(uri):
    """Stream the blocklist at *uri* and yield it line by line.

    Downloads with a 5 second timeout. On a download error, a read error or a
    non-2xx response, a message is logged to syslog and the generator stops
    early (callers simply see fewer lines).

    :param uri: url of the blocklist to fetch
    """
    req_opts = {
        'url': uri,
        'timeout': 5,
        'stream': True
    }
    try:
        req = requests.get(**req_opts)
    except Exception as e:
        syslog.syslog(syslog.LOG_ERR,'blocklist download : unable to download file from %s (error : %s)' % (uri, e))
        return
    if 200 <= req.status_code <= 299:
        req.raw.decode_content = True
        prev_chop = ''
        while True:
            try:
                chop = req.raw.read(1024).decode()
                if not chop:
                    # stream exhausted; flush the trailing partial line, if any
                    if prev_chop:
                        yield prev_chop
                    break
                else:
                    parts = (prev_chop + chop).split('\n')
                    # The last element is either a partial line or '' (when the
                    # chunk ended exactly on a newline); keep it for the next read.
                    # NOTE(review): the original tested `parts[-1] != "\n"`, which is
                    # always true after split('\n'); an unconditional pop is equivalent.
                    prev_chop = parts.pop()
                    for part in parts:
                        yield part
            except Exception as e:
                syslog.syslog(syslog.LOG_ERR,'blocklist download : error reading file from %s (error : %s)' % (uri, e))
                return
    else:
        syslog.syslog(syslog.LOG_ERR,
            'blocklist download : unable to download file from %s (status_code: %d)' % (uri, req.status_code)
        )
from blocklists import BlocklistParser
if __name__ == '__main__':
    # NOTE(review): this span is a flattened diff view. Per the hunk header
    # "@ -26,215 +26,8" most of the body below was REMOVED by this commit and
    # replaced by the BlocklistParser call at the very end; the removed code now
    # lives in blocklists/__init__.py and blocklists/default_bl.py.
    # check for a running download process, this may take a while so it's better to check...
    try:
        lck = open('/tmp/unbound-download_blocklists.tmp', 'w+')
        fcntl.flock(lck, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        # already running, exit status 99
        sys.exit(99)
    # matches plausible DNS domain names, including punycode (xn--) TLDs
    domain_pattern = re.compile(
        r'^(([\da-zA-Z_])([_\w-]{,62})\.){,127}(([\da-zA-Z])[_\w-]{,61})'
        r'?([\da-zA-Z]\.((xn\-\-[a-zA-Z\d]+)|([a-zA-Z\d]{2,})))$'
    )
    destination_address = '0.0.0.0'
    rcode = 'NOERROR'
    startup_time = time.time()
    syslog.openlog('unbound', facility=syslog.LOG_LOCAL4)
    # 'data' maps domain -> policy metadata, 'config' carries global options
    blocklist_items = {
        'data': {},
        'config': {}
    }
    skip_download = False
    if os.path.exists('/tmp/unbound-blocklists.conf'):
        cnf = ConfigParser()
        cnf.read('/tmp/unbound-blocklists.conf')
        # previous run's config (falls back to the current one on first run)
        cnf_cache = ConfigParser()
        if os.path.exists('/tmp/unbound-blocklists.conf.cache'):
            cnf_cache.read('/tmp/unbound-blocklists.conf.cache')
        else:
            cnf_cache.read('/tmp/unbound-blocklists.conf')
        if cnf.sections() and cnf_cache.sections():
            # get the difference between the old and new configuration, there won't be any
            # if we're starting up, so it will proceed as normal.
            diff_cnf = {d: set(map(tuple, v.items())) for d,v in cnf._sections.items()}
            diff_cnf_cache = {d: set(map(tuple, v.items())) for d,v in cnf_cache._sections.items()}
            diffs_added = {header: diff_cnf[header] - diff_cnf_cache[header] for header, _ in diff_cnf.items()}
            diffs_removed = {header: diff_cnf_cache[header] - diff_cnf[header] for header, _ in diff_cnf.items()}
            # we can only skip download if the include option has changed, but it must proceed
            # if any other option has changed
            if (diffs_added['include'] or diffs_removed['include']):
                skip_download = True
                for (a, r) in zip(diffs_added, diffs_removed):
                    if (a != 'include' and r != 'include') and (diffs_added[a] or diffs_removed[r]):
                        skip_download = False
        if cnf.has_section('exclude'):
            # collect the user-supplied whitelist regexes, skipping invalid ones
            exclude_list = set()
            for exclude_item in cnf['exclude']:
                pattern = cnf['exclude'][exclude_item]
                try:
                    re.compile(pattern, re.IGNORECASE)
                    exclude_list.add(pattern)
                except re.error:
                    syslog.syslog(syslog.LOG_ERR,
                        'blocklist download : skip invalid whitelist exclude pattern "%s" (%s)' % (
                            exclude_item, pattern
                        )
                    )
            if not exclude_list:
                # '$^' matches nothing, i.e. an effectively empty whitelist
                exclude_list.add('$^')
            wp = '|'.join(exclude_list)
            whitelist_pattern = re.compile(wp, re.IGNORECASE)
            syslog.syslog(syslog.LOG_NOTICE, 'blocklist download : exclude domains matching %s' % wp)
        # NOTE(review): whitelist_pattern is only bound inside the branch above;
        # with no [exclude] section its later uses would raise NameError — verify
        # the config writer always emits that section.
        if cnf.has_section('settings'):
            if cnf.has_option('settings', 'address'):
                blocklist_items['config']['dst_addr'] = cnf.get('settings', 'address')
            if cnf.has_option('settings', 'rcode'):
                blocklist_items['config']['rcode'] = cnf.get('settings', 'rcode')
        if not skip_download:
            # fetch all blocklists, will replace the existing file used by Unbound
            if cnf.has_section('blocklists'):
                for blocklist in cnf['blocklists']:
                    # key format is <type>_<shortcode>; custom lists show as 'Custom'
                    list_type = blocklist.split('_', 1)
                    bl_shortcode = 'Custom' if list_type[0] == 'custom' else list_type[1]
                    file_stats = {'uri': cnf['blocklists'][blocklist], 'skip' : 0, 'blocklist': 0, 'lines' :0}
                    for line in uri_reader(cnf['blocklists'][blocklist]):
                        file_stats['lines'] += 1
                        # cut line into parts before comment marker (if any)
                        tmp = line.split('#')[0].split()
                        entry = None
                        while tmp:
                            # take the right-most field, skipping hosts-file sink addresses
                            entry = tmp.pop(-1)
                            if entry not in ['127.0.0.1', '0.0.0.0']:
                                break
                        if entry:
                            domain = entry.lower()
                            if whitelist_pattern.match(entry):
                                file_stats['skip'] += 1
                            else:
                                if domain_pattern.match(domain):
                                    file_stats['blocklist'] += 1
                                    blocklist_items['data'][entry] = {'bl': bl_shortcode, 'wildcard': False}
                                else:
                                    file_stats['skip'] += 1
                    syslog.syslog(
                        syslog.LOG_NOTICE,
                        'blocklist download %(uri)s (lines: %(lines)d exclude: %(skip)d block: %(blocklist)d)' % file_stats
                    )
            # after a download, always apply exact custom matches on top of it
            if cnf.has_section('include'):
                for item in cnf['include']:
                    entry = cnf['include'][item].rstrip().lower()
                    if not whitelist_pattern.match(entry):
                        if domain_pattern.match(entry):
                            blocklist_items['data'][entry] = {'bl': 'Manual','wildcard': False}
                        # wildcard entries also register the bare domain, flagged as wildcard
                        if '*' in entry:
                            blocklist_items['data'][entry.replace('*.', '')] = {'bl': 'Manual', 'wildcard': True}
        else:
            # only modify the existing list, administrate on added and removed exact custom matches
            syslog.syslog(syslog.LOG_NOTICE, 'blocklist: skip download')
            if os.path.exists('/var/unbound/data/dnsbl.json'):
                blocklist_items = ujson.load(open('/var/unbound/data/dnsbl.json', 'r'))
            if diffs_removed['include']:
                for item in diffs_removed['include']:
                    entry = item[1].replace('*.', '').rstrip().lower()
                    # include entry may have been overridden by the whitelist, so use pop()
                    blocklist_items['data'].pop(entry, None)
            if diffs_added['include']:
                for item in diffs_added['include']:
                    entry = item[1].rstrip().lower()
                    if not whitelist_pattern.match(entry):
                        if domain_pattern.match(entry):
                            blocklist_items['data'][entry] = {'bl': 'Manual', 'wildcard': False}
                        if '*' in entry:
                            blocklist_items['data'][entry.replace('*.', '')] = {'bl': 'Manual', 'wildcard': True}
        # check if there are wildcards in the dataset
        has_wildcards = False
        for item in blocklist_items['data']:
            if blocklist_items['data'][item]['wildcard'] == True:
                has_wildcards = True
                break
        blocklist_items['config']['has_wildcards'] = has_wildcards
        with open('/tmp/unbound-blocklists.conf.cache', 'w') as cache_config:
            # cache the current config so we can diff on it the next time
            cnf.write(cache_config)
    # write out results
    if not os.path.exists('/var/unbound/data'):
        os.makedirs('/var/unbound/data')
    with open("/var/unbound/data/dnsbl.json.new", 'w') as unbound_outf:
        if blocklist_items:
            ujson.dump(blocklist_items, unbound_outf)
    # atomically replace the current dnsbl so unbound can pick up on it
    os.replace('/var/unbound/data/dnsbl.json.new', '/var/unbound/data/dnsbl.json')
    syslog.syslog(syslog.LOG_NOTICE, "blocklist download done in %0.2f seconds (%d records)" % (
        time.time() - startup_time, len(blocklist_items['data'])
    ))
    # NOTE(review): these two lines are the commit's ADDED replacement for the
    # body above — the post-commit main delegates entirely to BlocklistParser.
    bl = BlocklistParser()
    bl.update_blocklist()

View File

@ -0,0 +1,265 @@
#!/usr/local/bin/python3
"""
Copyright (c) 2023 Deciso B.V.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"""
import os
import requests
import syslog
import re
import glob
import importlib
import sys
import fcntl
import ujson
import time
import hashlib
from configparser import ConfigParser
class BaseBlocklistHandler:
    """
    Base class for blocklist handlers.

    A handler reads its own configuration and turns the blocklists it
    describes into a dictionary of {domain: metadata}. BlocklistParser
    discovers all subclasses in this package and merges their output by
    priority.
    """
    def __init__(self, config=None):
        """
        :param config: optional path to an ini-style configuration file,
            parsed into self.cnf when it exists (None leaves self.cnf unset).
        """
        self.config = config
        self.cnf = None
        # merge priority; handlers with a higher value win on domain collisions
        self.priority = 0
        # location of the blocklist file currently used by Unbound
        self.cur_bl_location = '/var/unbound/data/dnsbl.json'
        # validates entries as plausible DNS domain names (incl. punycode TLDs)
        self.domain_pattern = re.compile(
            r'^(([\da-zA-Z_])([_\w-]{,62})\.){,127}(([\da-zA-Z])[_\w-]{,61})'
            r'?([\da-zA-Z]\.((xn\-\-[a-zA-Z\d]+)|([a-zA-Z\d]{2,})))$'
        )
        self._load_config()

    def get_config(self):
        """
        Get statically defined configuration options.
        """
        pass

    def get_blocklist(self):
        """
        Overridden by derived classes to produce a formatted blocklist. Returns a dictionary
        with domains as keys and a dictionary of metadata as values
        """
        pass

    def _load_config(self):
        """
        Load a configuration. If a cached version exists, will also provide a dictionary
        containing the differences between the two.
        """
        # guard against config=None (the constructor default) before probing
        # the filesystem; os.path.exists(None) would raise TypeError
        if self.config and os.path.exists(self.config):
            self.cnf = ConfigParser()
            self.cnf.read(self.config)

    def _blocklists_in_config(self):
        """
        Generator for derived classes to iterate over configured blocklist urls.
        Assumes <shortcode>=<url> configuration formatting
        """
        if self.cnf and self.cnf.has_section('blocklists'):
            for blocklist in self.cnf['blocklists']:
                # key format is <type>_<shortcode>; custom lists show as 'Custom'
                list_type = blocklist.split('_', 1)
                bl_shortcode = 'Custom' if list_type[0] == 'custom' else list_type[1]
                yield (self.cnf['blocklists'][blocklist], bl_shortcode)

    def _domains_in_blocklist(self, blocklist):
        """
        Generator for derived classes to iterate over downloaded domains.
        """
        for line in self._blocklist_reader(blocklist):
            # cut line into parts before comment marker (if any)
            tmp = line.split('#')[0].split()
            entry = None
            while tmp:
                # take the right-most field, skipping hosts-file sink addresses
                entry = tmp.pop(-1)
                if entry not in ['127.0.0.1', '0.0.0.0']:
                    break
            if entry:
                yield entry

    def _blocklist_reader(self, uri):
        """
        Decides whether a blocklist can be read from a cached file or
        needs to be downloaded. Yields (unformatted) domains either way
        """
        total_lines = 0
        from_cache = False
        # md5 is used only to derive a cache filename, not for security
        h = hashlib.md5(uri.encode()).hexdigest()
        cache_loc = '/tmp/bl_cache/'
        if os.path.exists(cache_loc):
            filep = cache_loc + h
            if os.path.exists(filep):
                fstat = os.stat(filep).st_ctime
                if (time.time() - fstat) < 72000: # 20 hours, a bit under the recommended cron time
                    from_cache = True
                    for line in open(filep):
                        total_lines += 1
                        yield line
        if not from_cache:
            os.makedirs(cache_loc, exist_ok=True)
            # write-through: persist every downloaded line while yielding it
            with open(cache_loc + h, 'w') as outf:
                for line in self._uri_reader(uri):
                    outf.write(line + '\n')
                    total_lines += 1
                    yield line
        syslog.syslog(
            syslog.LOG_NOTICE, 'blocklist download: %d total lines %s for %s' %
            (total_lines, 'from cache' if from_cache else 'downloaded', uri)
        )

    def _uri_reader(self, uri):
        """
        Takes a URI and yields domain entries.

        Downloads with a 5 second timeout; errors and non-2xx responses are
        logged to syslog and simply terminate the generator early.
        """
        req_opts = {
            'url': uri,
            'timeout': 5,
            'stream': True
        }
        try:
            req = requests.get(**req_opts)
        except Exception as e:
            syslog.syslog(syslog.LOG_ERR,'blocklist download : unable to download file from %s (error : %s)' % (uri, e))
            return
        if 200 <= req.status_code <= 299:
            req.raw.decode_content = True
            prev_chop = ''
            while True:
                try:
                    chop = req.raw.read(1024).decode()
                    if not chop:
                        # stream exhausted; flush the trailing partial line, if any
                        if prev_chop:
                            yield prev_chop
                        break
                    else:
                        parts = (prev_chop + chop).split('\n')
                        # The last element is either a partial line or '' (chunk
                        # ended exactly on a newline); keep it for the next read.
                        # NOTE(review): the original's `parts[-1] != "\n"` test is
                        # always true after split('\n'); unconditional pop is equivalent.
                        prev_chop = parts.pop()
                        for part in parts:
                            yield part
                except Exception as e:
                    syslog.syslog(syslog.LOG_ERR,'blocklist download : error reading file from %s (error : %s)' % (uri, e))
                    return
        else:
            syslog.syslog(syslog.LOG_ERR,
                'blocklist download : unable to download file from %s (status_code: %d)' % (uri, req.status_code)
            )
class BlocklistParser:
    """
    Discovers every BaseBlocklistHandler subclass in this package, collects
    their blocklists and writes the merged result for Unbound to pick up.
    """
    def __init__(self):
        # check for a running download process, this may take a while so it's better to check...
        try:
            lck = open('/tmp/unbound-download_blocklists.tmp', 'w+')
            fcntl.flock(lck, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            # already running, exit status 99
            sys.exit(99)
        syslog.openlog('unbound', facility=syslog.LOG_LOCAL4)
        self.handlers = list()
        self._register_handlers()
        self.startup_time = time.time()

    def _register_handlers(self):
        """
        Import every module in this package and instantiate each
        BaseBlocklistHandler subclass found, populating self.handlers.
        """
        handlers = list()
        for filename in glob.glob("%s/*.py" % os.path.dirname(__file__)):
            importlib.import_module(".%s" % os.path.splitext(os.path.basename(filename))[0], __name__)
        for module_name in dir(sys.modules[__name__]):
            for attribute_name in dir(getattr(sys.modules[__name__], module_name)):
                cls = getattr(getattr(sys.modules[__name__], module_name), attribute_name)
                if isinstance(cls, type) and issubclass(cls, BaseBlocklistHandler)\
                        and cls not in (BaseBlocklistHandler,):
                    handlers.append(cls())
        self.handlers = handlers

    def _get_config(self):
        """
        Return the first non-empty config offered by a handler, or None
        when no handler provides one.
        """
        for handler in self.handlers:
            cfg = handler.get_config()
            if cfg:
                return cfg

    def _merge_results(self, blocklists):
        """
        Take output of all the handlers and merge based on each handlers' priority.
        The default handler has highest priority

        :param blocklists: {priority: {domain: metadata, ...}, ...}
        :return: merged {domain: metadata} dictionary
        """
        if len(blocklists) == 1:
            return next(iter(blocklists.values()))
        # order by priority, highest first, so higher priority policies win
        blocklists = dict(sorted(blocklists.items(), reverse=True))
        first = next(iter(blocklists.values()))
        for bl in list(blocklists.values())[1:]:
            for key, value in bl.items():
                if key not in first:
                    # no collision, merge
                    first[key] = value
                else:
                    # a handler with a lower priority has provided a policy
                    # on a domain that already exists in the blocklist,
                    # add it for debugging purposes
                    first[key].setdefault('collisions', []).append(value)
        return first

    def update_blocklist(self):
        """
        Collect the blocklist of every registered handler, merge the results
        and atomically replace the dataset used by Unbound.
        """
        blocklists = {}
        merged = {}
        for handler in self.handlers:
            blocklists[handler.priority] = handler.get_blocklist()
        merged['data'] = self._merge_results(blocklists)
        # _get_config() returns None when no handler supplies a (non-empty)
        # config; fall back to an empty dict so the has_wildcards assignment
        # below cannot raise TypeError
        merged['config'] = self._get_config() or {}
        # check if there are wildcards in the dataset
        has_wildcards = False
        for item in merged['data']:
            if merged['data'][item].get('wildcard') == True:
                has_wildcards = True
                break
        merged['config']['has_wildcards'] = has_wildcards
        # write out results
        if not os.path.exists('/var/unbound/data'):
            os.makedirs('/var/unbound/data')
        with open("/var/unbound/data/dnsbl.json.new", 'w') as unbound_outf:
            if merged:
                ujson.dump(merged, unbound_outf)
        # atomically replace the current dnsbl so unbound can pick up on it
        os.replace('/var/unbound/data/dnsbl.json.new', '/var/unbound/data/dnsbl.json')
        syslog.syslog(syslog.LOG_NOTICE, "blocklist parsing done in %0.2f seconds (%d records)" % (
            time.time() - self.startup_time, len(merged['data'])
        ))

View File

@ -0,0 +1,112 @@
#!/usr/local/bin/python3
"""
Copyright (c) 2023 Deciso B.V.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"""
import syslog
import re
import os
import ujson
from . import BaseBlocklistHandler
class DefaultBlocklistHandler(BaseBlocklistHandler):
    """
    Default handler: reads /tmp/unbound-blocklists.conf, downloads the
    configured blocklists and applies the manual include/exclude entries.
    """
    def __init__(self):
        super().__init__('/tmp/unbound-blocklists.conf')
        # highest priority: this handler wins on domain collisions
        self.priority = 100
        self._whitelist_pattern = self._get_excludes()

    def get_config(self):
        """
        Return statically configured options (destination address, rcode),
        an empty dict when no [settings] section is available.
        """
        cfg = {}
        if self.cnf and self.cnf.has_section('settings'):
            if self.cnf.has_option('settings', 'address'):
                cfg['dst_addr'] = self.cnf.get('settings', 'address')
            if self.cnf.has_option('settings', 'rcode'):
                cfg['rcode'] = self.cnf.get('settings', 'rcode')
        return cfg

    def get_blocklist(self):
        """
        Download every configured blocklist and merge in the manual entries.

        :return: {domain: {'bl': shortcode, 'wildcard': bool, ...}, ...}
        """
        result = {}
        for blocklist, bl_shortcode in self._blocklists_in_config():
            per_file_stats = {'uri': blocklist, 'skip': 0, 'blocklist': 0}
            for entry in self._domains_in_blocklist(blocklist):
                domain = entry.lower()
                if self._whitelist_pattern.match(entry):
                    per_file_stats['skip'] += 1
                else:
                    if self.domain_pattern.match(domain):
                        per_file_stats['blocklist'] += 1
                        if entry in result:
                            # duplicate domain, signify in dataset for debugging purposes
                            if 'duplicates' in result[entry]:
                                result[entry]['duplicates'] += ',%s' % bl_shortcode
                            else:
                                result[entry]['duplicates'] = '%s' % bl_shortcode
                        else:
                            result[entry] = {'bl': bl_shortcode, 'wildcard': False}
                    else:
                        per_file_stats['skip'] += 1
            syslog.syslog(
                syslog.LOG_NOTICE,
                'blocklist: %(uri)s (exclude: %(skip)d block: %(blocklist)d)' % per_file_stats
            )
        if self.cnf and self.cnf.has_section('include'):
            for key, value in self.cnf['include'].items():
                if key.startswith('custom'):
                    entry = value.rstrip().lower()
                    if not self._whitelist_pattern.match(entry):
                        if self.domain_pattern.match(entry):
                            result[entry] = {'bl': 'Manual', 'wildcard': False}
                elif key.startswith('wildcard'):
                    if self.domain_pattern.match(value):
                        # do not apply whitelist to wildcard domains
                        result[value] = {'bl': 'Manual', 'wildcard': True}
        return result

    def _get_excludes(self):
        """
        Compile the configured exclude (whitelist) patterns into one regex;
        invalid patterns are logged and skipped.
        """
        whitelist_pattern = re.compile('$^') # match nothing
        # guard self.cnf like get_config()/get_blocklist() do: it is None when
        # the configuration file is missing, which would crash the constructor
        if self.cnf and self.cnf.has_section('exclude'):
            exclude_list = set()
            for exclude_item in self.cnf['exclude']:
                pattern = self.cnf['exclude'][exclude_item]
                try:
                    re.compile(pattern, re.IGNORECASE)
                    exclude_list.add(pattern)
                except re.error:
                    syslog.syslog(syslog.LOG_ERR,
                        'blocklist download : skip invalid whitelist exclude pattern "%s" (%s)' % (
                            exclude_item, pattern
                        )
                    )
            if not exclude_list:
                exclude_list.add('$^')
            wp = '|'.join(exclude_list)
            whitelist_pattern = re.compile(wp, re.IGNORECASE)
            syslog.syslog(syslog.LOG_NOTICE, 'blocklist download : exclude domains matching %s' % wp)
        return whitelist_pattern