diff --git a/.gitignore b/.gitignore
index 4be97fb..b69d282 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ __pycache__/
 *$py.class
 
 .env
+tmp/
 
 # C extensions
 *.so
diff --git a/requirements.txt b/requirements.txt
index 95e5f37..a0ea8da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 alembic==1.4.2
 Babel==2.8.0
 bcrypt==3.1.7
+beautifulsoup4==4.9.1
 blinker==1.4
 certifi==2020.6.20
 cffi==1.14.0
@@ -39,6 +40,7 @@ pytz==2020.1
 requests==2.24.0
 requests-oauthlib==1.3.0
 six==1.15.0
+soupsieve==2.0.1
 speaklater==1.3
 SQLAlchemy==1.3.17
 SQLAlchemy-Utils==0.36.6
@@ -47,3 +49,4 @@ URLObject==2.4.3
 visitor==0.1.3
 Werkzeug==1.0.1
 WTForms==2.3.1
+WTForms-SQLAlchemy==0.1
diff --git a/scrape_fp.py b/scrape_fp.py
new file mode 100644
index 0000000..0a2c74e
--- /dev/null
+++ b/scrape_fp.py
@@ -0,0 +1,148 @@
+"""Scrape the Goslar Ferienpass programme from feripro and print its events."""
+from app import app, db, get_admin_unit
+from pprint import pprint
+import datetime
+from dateutil import parser, tz
+import pytz
+from urllib.request import urlopen, URLError
+from bs4 import BeautifulSoup
+import requests
+from os import makedirs, path
+import json
+import re
+import unicodedata
+from models import Event, EventStatus, EventCategory, EventPlace, EventOrganizer, AdminUnit
+
+berlin_tz = pytz.timezone('Europe/Berlin')
+
+# Matches `key: JSON.parse('...')` assignments inside the page's inline script.
+JS_ASSIGNS_REGEX = r"(\w*)\s*:\s*JSON\.parse\('(.*)'\)"
+
+
+def scrape(debug=True):
+    """Fetch the programme page and print the core fields of every event.
+
+    With ``debug`` set, the page is downloaded once to ``tmp/fp.html`` and
+    re-read from there on later runs, so the remote site is not hit each time.
+
+    Raises:
+        KeyError: If the page contains no embedded ``events`` data.
+    """
+    url = 'https://goslar.feripro.de/programm/40/anmeldung/veranstaltungen'
+    doc = _load_document(url, debug)
+
+    for js_event in _extract_events(doc):
+        event = Event()
+        event.admin_unit = get_admin_unit('Stadt Goslar')
+
+        organizer = js_event['organizer']
+
+        name = js_event['name']
+        start = parse_date_time_str(js_event['start'])
+        end = parse_date_time_str(js_event['end'])
+        # Build a new list so the scraped record itself is left untouched.
+        tag_list = list(js_event['tags'] or []) + ['Ferienpass']
+        tags = ','.join(tag_list)
+        description = js_event['description']
+        kid_friendly = True
+        accessible_for_free = js_event['price'] == '0.00'
+        external_link = url + '#' + str(js_event['event_id'])
+
+        if js_event['teaser'] and js_event['teaser'] != js_event['description']:
+            description = js_event['teaser'] + '\n\n' + description
+
+        if js_event['additional_info']:
+            description = description + '\n\n' + js_event['additional_info']
+
+        from_age = int(js_event['min_age']) if js_event['min_age'] else None
+        to_age = int(js_event['max_age']) if js_event['max_age'] else None
+
+        status = EventStatus.cancelled if js_event['canceled'] else EventStatus.scheduled
+
+        # Guard against a null meeting point instead of failing on None.
+        meeting_point = (js_event['meeting_point'] or '').replace('\r\n', ', ')
+
+        # Event-specific public contact data wins over the organizer's defaults.
+        host_name = js_event['name_public'] or organizer['name']
+        host_phone = js_event['phone_public'] or organizer['phone']
+        host_email = js_event['email_public'] or organizer['email']
+        host_url = organizer['website'] or organizer['facebook']
+
+        print(
+            external_link,
+            name,
+            start,
+            end,
+            status,
+            tags,
+            from_age,
+            to_age,
+            meeting_point,
+            js_event['meeting_point_latitude'],
+            js_event['meeting_point_longitude'],
+            host_name,
+            host_phone,
+            host_email,
+            host_url)
+
+
+def _load_document(url, debug):
+    """Return the parsed page; in debug mode it is cached in tmp/fp.html."""
+    if not debug:
+        with urlopen(url) as response:
+            return BeautifulSoup(response, 'html.parser')
+
+    filename = "tmp/fp.html"
+    if not path.exists(filename):
+        # tmp/ is git-ignored, so a fresh checkout does not have it yet.
+        makedirs(path.dirname(filename), exist_ok=True)
+        with urlopen(url) as response, open(filename, "wb") as cache_file:
+            cache_file.write(response.read())
+
+    # Read the cache as bytes (it was written as bytes) so BeautifulSoup
+    # detects the encoding, exactly as it does for the live response.
+    with open(filename, "rb") as cache_file:
+        return BeautifulSoup(cache_file, 'html.parser')
+
+
+def _decode_js_string(js_str):
+    """Undo the JavaScript string escaping around an embedded JSON payload."""
+    # unicode_escape resolves the backslash escapes but reads the bytes as
+    # latin-1; the latin-1/utf-8 round trip restores non-ASCII characters.
+    return js_str.encode('utf-8').decode('unicode_escape').encode('latin-1').decode('utf-8')
+
+
+def _extract_events(doc):
+    """Return the ``events`` list embedded in the ``window.fp_initial`` script.
+
+    Raises:
+        KeyError: If no script assigns ``events`` via ``JSON.parse``.
+    """
+    for javascript in doc.find_all('script'):
+        javascript_contents = javascript.contents[0] if javascript.contents else ''
+
+        if 'window.fp_initial' not in javascript_contents:
+            continue
+
+        for key, json_str in re.findall(JS_ASSIGNS_REGEX, javascript_contents, re.MULTILINE):
+            if key == 'events':
+                # strict=False tolerates raw control characters in strings.
+                return json.loads(_decode_js_string(json_str), strict=False)
+
+    raise KeyError('events: no embedded event data found on the page')
+
+
+def parse_date_time_str(date_time_str):
+    """Parse an ISO 8601 string into a Europe/Berlin datetime, or None if empty."""
+    if not date_time_str:
+        return None
+
+    date_time = datetime.datetime.fromisoformat(date_time_str)
+    if date_time.tzinfo is not None:
+        # localize() rejects aware datetimes, so convert those instead.
+        return date_time.astimezone(berlin_tz)
+    return berlin_tz.localize(date_time)
+
+
+if __name__ == '__main__':
+    scrape()