eventcally/scrape_fp.py
2020-07-19 14:02:54 +02:00

118 lines
3.9 KiB
Python

from app import app, db, get_admin_unit
from pprint import pprint
import datetime
from dateutil import parser, tz
import pytz
from urllib.request import urlopen, URLError
from bs4 import BeautifulSoup
import requests
from os import path
import json
import re
import unicodedata
from models import Event, EventStatus, EventCategory, EventPlace, EventOrganizer, AdminUnit
berlin_tz = pytz.timezone('Europe/Berlin')
def scrape(debug = True):
url = 'https://goslar.feripro.de/programm/40/anmeldung/veranstaltungen'
if debug:
filename = "tmp/fp.html"
if not path.exists(filename):
response = urlopen(url)
with open(filename, "wb") as text_file:
text_file.write(response.read())
doc = BeautifulSoup(open(filename), 'html.parser')
else:
response = urlopen(url)
doc = BeautifulSoup(response, 'html.parser')
js_assigns_regex = r"(\w*)\s*:\s*JSON\.parse\('(.*)'\)"
js_assigns = dict()
javascripts = doc.find_all('script')
for javascript in javascripts:
javascript_contents = javascript.contents[0] if len(javascript.contents) > 0 else ''
if 'window.fp_initial' in javascript_contents:
matches = re.findall(js_assigns_regex, javascript_contents, re.MULTILINE)
for match in matches:
key = match[0]
if not key in ['events']:
continue
json_str = match[1]
decoded_json_str = json_str.encode('utf-8').decode('unicode_escape').encode('latin-1').decode('utf-8')
value = json.loads(decoded_json_str, strict=False)
js_assigns[key] = value
break
for js_event in js_assigns['events']:
event = Event()
event.admin_unit = get_admin_unit('Stadt Goslar')
organizer = js_event['organizer']
name = js_event['name']
start = parse_date_time_str(js_event['start'])
end = parse_date_time_str(js_event['end'])
tag_list = js_event['tags']
tag_list.append('Ferienpass')
tags = ','.join(tag_list)
description = js_event['description']
kid_friendly = True
accessible_for_free = js_event['price'] == '0.00'
external_link = url + '#' + str(js_event['event_id'])
if js_event['teaser'] and js_event['teaser'] != js_event['description']:
description = js_event['teaser'] + '\n\n' + description
if js_event['additional_info']:
description = description + '\n\n' + js_event['additional_info']
from_age = None
if js_event['min_age']:
from_age = int(js_event['min_age'])
to_age = None
if js_event['max_age']:
to_age = int(js_event['max_age'])
status = EventStatus.cancelled if js_event['canceled'] else EventStatus.scheduled
meeting_point = js_event['meeting_point'].replace('\r\n', ', ')
host_name = js_event['name_public'] if js_event['name_public'] else organizer['name']
host_phone = js_event['phone_public'] if js_event['phone_public'] else organizer['phone']
host_email = js_event['email_public'] if js_event['email_public'] else organizer['email']
host_url = organizer['website'] if organizer['website'] else organizer['facebook']
print(
external_link,
name,
start,
end,
status,
tags,
from_age,
to_age,
meeting_point,
js_event['meeting_point_latitude'],
js_event['meeting_point_longitude'],
host_name,
host_phone,
host_email,
host_url)
def parse_date_time_str(date_time_str):
if not date_time_str:
return None
date_time = datetime.datetime.fromisoformat(date_time_str)
return berlin_tz.localize(date_time)
if __name__ == '__main__':
scrape()