scrape vorbereitungen

This commit is contained in:
Daniel Grams 2020-07-19 14:02:54 +02:00
parent e2c7c8be8b
commit 8ed23feaf3
3 changed files with 122 additions and 0 deletions

1
.gitignore vendored
View File

@@ -4,6 +4,7 @@ __pycache__/
*$py.class
.env
tmp/
# C extensions
*.so

View File

@@ -1,6 +1,7 @@
alembic==1.4.2
Babel==2.8.0
bcrypt==3.1.7
beautifulsoup4==4.9.1
blinker==1.4
certifi==2020.6.20
cffi==1.14.0
@@ -39,6 +40,7 @@ pytz==2020.1
requests==2.24.0
requests-oauthlib==1.3.0
six==1.15.0
soupsieve==2.0.1
speaklater==1.3
SQLAlchemy==1.3.17
SQLAlchemy-Utils==0.36.6
@@ -47,3 +49,4 @@ URLObject==2.4.3
visitor==0.1.3
Werkzeug==1.0.1
WTForms==2.3.1
WTForms-SQLAlchemy==0.1

118
scrape_fp.py Normal file
View File

@@ -0,0 +1,118 @@
from app import app, db, get_admin_unit
from pprint import pprint
import datetime
from dateutil import parser, tz
import pytz
from urllib.request import urlopen, URLError
from bs4 import BeautifulSoup
import requests
from os import path
import json
import re
import unicodedata
from models import Event, EventStatus, EventCategory, EventPlace, EventOrganizer, AdminUnit
# Target timezone for all scraped event datetimes (the events are local to Goslar).
berlin_tz = pytz.timezone('Europe/Berlin')
def scrape(debug=True):
    """Scrape the Goslar 'Ferienpass' event listing and print the parsed events.

    The page embeds its data as ``key: JSON.parse('<escaped json>')``
    assignments inside the script that initializes ``window.fp_initial``.
    This function extracts the ``events`` entry, maps each raw event onto
    local variables mirroring the ``Event`` model fields, and prints one
    summary line per event.  Nothing is persisted yet (this is preparation
    work for a later import step).

    :param debug: when True, cache the downloaded HTML under ``tmp/fp.html``
        and reuse it on subsequent runs to avoid re-fetching the page.
    """
    url = 'https://goslar.feripro.de/programm/40/anmeldung/veranstaltungen'
    doc = _load_document(url, debug)

    for js_event in _extract_js_events(doc):
        # Computed but not yet persisted/printed: kept in preparation for the
        # upcoming import into the database.
        event = Event()
        event.admin_unit = get_admin_unit('Stadt Goslar')
        kid_friendly = True
        accessible_for_free = js_event['price'] == '0.00'

        organizer = js_event['organizer']
        name = js_event['name']
        start = parse_date_time_str(js_event['start'])
        end = parse_date_time_str(js_event['end'])

        tag_list = js_event['tags']
        tag_list.append('Ferienpass')
        tags = ','.join(tag_list)

        external_link = url + '#' + str(js_event['event_id'])

        # A teaser that merely repeats the description adds nothing.
        description = js_event['description']
        if js_event['teaser'] and js_event['teaser'] != js_event['description']:
            description = js_event['teaser'] + '\n\n' + description
        if js_event['additional_info']:
            description = description + '\n\n' + js_event['additional_info']

        from_age = int(js_event['min_age']) if js_event['min_age'] else None
        to_age = int(js_event['max_age']) if js_event['max_age'] else None

        status = EventStatus.cancelled if js_event['canceled'] else EventStatus.scheduled
        meeting_point = js_event['meeting_point'].replace('\r\n', ', ')

        # Prefer the public-facing contact details; fall back to the
        # organizer record when a public field is empty.
        host_name = js_event['name_public'] if js_event['name_public'] else organizer['name']
        host_phone = js_event['phone_public'] if js_event['phone_public'] else organizer['phone']
        host_email = js_event['email_public'] if js_event['email_public'] else organizer['email']
        host_url = organizer['website'] if organizer['website'] else organizer['facebook']

        print(
            external_link,
            name,
            start,
            end,
            status,
            tags,
            from_age,
            to_age,
            meeting_point,
            js_event['meeting_point_latitude'],
            js_event['meeting_point_longitude'],
            host_name,
            host_phone,
            host_email,
            host_url)


def _load_document(url, debug):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    In debug mode the raw HTML is cached at ``tmp/fp.html`` so repeated runs
    do not hit the server.  All handles are closed via context managers (the
    original left both the HTTP response and the cache file open).
    """
    if debug:
        filename = "tmp/fp.html"
        if not path.exists(filename):
            with urlopen(url) as response, open(filename, "wb") as text_file:
                text_file.write(response.read())
        # The embedded JSON round-trips through UTF-8 below, so read as UTF-8.
        with open(filename, encoding='utf-8') as html_file:
            return BeautifulSoup(html_file, 'html.parser')
    with urlopen(url) as response:
        return BeautifulSoup(response, 'html.parser')


def _extract_js_events(doc):
    """Return the list parsed from the page's ``events: JSON.parse(...)`` blob.

    Only the ``<script>`` containing ``window.fp_initial`` is inspected.
    Returns an empty list when no such script or assignment exists (the
    original raised KeyError in that case).
    """
    js_assigns_regex = r"(\w*)\s*:\s*JSON\.parse\('(.*)'\)"
    for javascript in doc.find_all('script'):
        content = javascript.contents[0] if javascript.contents else ''
        if 'window.fp_initial' not in content:
            continue
        for key, json_str in re.findall(js_assigns_regex, content, re.MULTILINE):
            if key != 'events':
                continue
            # The JSON is JS-escaped UTF-8: undo the backslash escapes, then
            # reinterpret the resulting code points as UTF-8 bytes.
            decoded_json_str = json_str.encode('utf-8').decode('unicode_escape').encode('latin-1').decode('utf-8')
            return json.loads(decoded_json_str, strict=False)
        break  # only the fp_initial script is relevant
    return []
def parse_date_time_str(date_time_str):
    """Parse an ISO-8601 string into a Berlin-localized datetime.

    :param date_time_str: ISO-8601 timestamp without timezone info, or a
        falsy value.
    :return: the timezone-aware datetime, or None for empty/missing input.
    """
    if date_time_str:
        naive = datetime.datetime.fromisoformat(date_time_str)
        return berlin_tz.localize(naive)
    return None
# Script entry point: run the scraper directly (debug=True caches the page under tmp/).
if __name__ == '__main__':
    scrape()