mirror of
https://github.com/lucaspalomodevelop/eventcally.git
synced 2026-03-13 00:07:22 +00:00
scrape vorbereitungen
This commit is contained in:
parent
e2c7c8be8b
commit
8ed23feaf3
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,6 +4,7 @@ __pycache__/
|
||||
*$py.class
|
||||
|
||||
.env
|
||||
tmp/
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
alembic==1.4.2
|
||||
Babel==2.8.0
|
||||
bcrypt==3.1.7
|
||||
beautifulsoup4==4.9.1
|
||||
blinker==1.4
|
||||
certifi==2020.6.20
|
||||
cffi==1.14.0
|
||||
@ -39,6 +40,7 @@ pytz==2020.1
|
||||
requests==2.24.0
|
||||
requests-oauthlib==1.3.0
|
||||
six==1.15.0
|
||||
soupsieve==2.0.1
|
||||
speaklater==1.3
|
||||
SQLAlchemy==1.3.17
|
||||
SQLAlchemy-Utils==0.36.6
|
||||
@ -47,3 +49,4 @@ URLObject==2.4.3
|
||||
visitor==0.1.3
|
||||
Werkzeug==1.0.1
|
||||
WTForms==2.3.1
|
||||
WTForms-SQLAlchemy==0.1
|
||||
|
||||
118
scrape_fp.py
Normal file
118
scrape_fp.py
Normal file
@ -0,0 +1,118 @@
|
||||
from app import app, db, get_admin_unit
|
||||
from pprint import pprint
|
||||
import datetime
|
||||
from dateutil import parser, tz
|
||||
import pytz
|
||||
from urllib.request import urlopen, URLError
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from os import path
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
from models import Event, EventStatus, EventCategory, EventPlace, EventOrganizer, AdminUnit
|
||||
|
||||
# Timezone applied to the naive timestamps parsed in parse_date_time_str().
berlin_tz = pytz.timezone('Europe/Berlin')
|
||||
|
||||
def _load_document(url, debug):
    """Return the parsed HTML of *url*; in debug mode it is cached in tmp/fp.html."""
    import os  # local import: the module itself only imports `path` from os

    if not debug:
        with urlopen(url) as response:
            return BeautifulSoup(response, 'html.parser')

    filename = "tmp/fp.html"

    if not path.exists(filename):
        # Download fully before touching the cache file so a failed request
        # does not leave an empty cache behind.
        with urlopen(url) as response:
            content = response.read()

        # tmp/ is git-ignored, so it does not exist in a fresh checkout.
        os.makedirs(path.dirname(filename), exist_ok=True)
        with open(filename, "wb") as cache_file:
            cache_file.write(content)

    # The cache holds the raw response bytes; hand them to BeautifulSoup as
    # bytes (like the live response) instead of decoding them with the
    # platform's default text encoding.
    with open(filename, "rb") as cache_file:
        return BeautifulSoup(cache_file, 'html.parser')


def _extract_events(doc):
    """Return the decoded ``events`` value from the ``window.fp_initial`` script in *doc*."""
    js_assigns_regex = r"(\w*)\s*:\s*JSON\.parse\('(.*)'\)"
    missing = object()

    for javascript in doc.find_all('script'):
        javascript_contents = javascript.contents[0] if javascript.contents else ''

        if 'window.fp_initial' not in javascript_contents:
            continue

        events = missing
        for key, json_str in re.findall(js_assigns_regex, javascript_contents, re.MULTILINE):
            if key != 'events':
                continue

            # The JSON sits inside a JavaScript string literal: undo the JS
            # escapes first, then repair the UTF-8 bytes that step turned
            # into latin-1 characters.
            decoded_json_str = json_str.encode('utf-8').decode('unicode_escape').encode('latin-1').decode('utf-8')
            events = json.loads(decoded_json_str, strict=False)

        # NOTE(review): only the first script mentioning window.fp_initial is
        # inspected - confirm this matches the intended placement of `break`.
        if events is not missing:
            return events
        break

    raise KeyError("no 'events' JSON.parse assignment found in a window.fp_initial script")


def scrape(debug = True):
    """Read the Goslar Feripro programme page and print the fields of each event.

    Args:
        debug: When true, the page is downloaded once to ``tmp/fp.html`` and
            read from that file on later runs; when false, it is fetched
            from the network on every call.

    Raises:
        KeyError: If the page contains no ``events`` assignment, or an event
            entry lacks one of the keys read below.

    NOTE: the ``Event`` created per entry only receives its ``admin_unit``;
    nothing else is assigned to it or saved inside this function.
    """
    url = 'https://goslar.feripro.de/programm/40/anmeldung/veranstaltungen'

    doc = _load_document(url, debug)

    for js_event in _extract_events(doc):
        event = Event()
        event.admin_unit = get_admin_unit('Stadt Goslar')

        organizer = js_event['organizer']

        name = js_event['name']
        start = parse_date_time_str(js_event['start'])
        end = parse_date_time_str(js_event['end'])
        # Build a new list so the parsed source data is not modified.
        tags = ','.join([*js_event['tags'], 'Ferienpass'])
        description = js_event['description']
        kid_friendly = True
        accessible_for_free = js_event['price'] == '0.00'
        external_link = url + '#' + str(js_event['event_id'])

        if js_event['teaser'] and js_event['teaser'] != js_event['description']:
            description = js_event['teaser'] + '\n\n' + description

        if js_event['additional_info']:
            description = description + '\n\n' + js_event['additional_info']

        from_age = int(js_event['min_age']) if js_event['min_age'] else None
        to_age = int(js_event['max_age']) if js_event['max_age'] else None

        status = EventStatus.cancelled if js_event['canceled'] else EventStatus.scheduled

        # Guard against a null meeting point, like the other optional fields.
        meeting_point = (js_event['meeting_point'] or '').replace('\r\n', ', ')

        # Event-specific public contact details win over the organizer's.
        host_name = js_event['name_public'] or organizer['name']
        host_phone = js_event['phone_public'] or organizer['phone']
        host_email = js_event['email_public'] or organizer['email']
        host_url = organizer['website'] or organizer['facebook']

        print(
            external_link,
            name,
            start,
            end,
            status,
            tags,
            from_age,
            to_age,
            meeting_point,
            js_event['meeting_point_latitude'],
            js_event['meeting_point_longitude'],
            host_name,
            host_phone,
            host_email,
            host_url)
|
||||
|
||||
def parse_date_time_str(date_time_str):
    """Parse an ISO-format date/time string into an aware Europe/Berlin datetime.

    Args:
        date_time_str: A string accepted by
            ``datetime.datetime.fromisoformat``, or a falsy value
            (``None`` or an empty string).

    Returns:
        ``None`` for falsy input, otherwise a timezone-aware ``datetime``
        in ``berlin_tz``.

    Raises:
        ValueError: If the string is not a valid ISO format.
    """
    if not date_time_str:
        return None

    date_time = datetime.datetime.fromisoformat(date_time_str)

    if date_time.tzinfo is not None:
        # pytz's localize() only accepts naive datetimes and raises
        # ValueError otherwise, so convert values that carry an offset.
        return date_time.astimezone(berlin_tz)

    return berlin_tz.localize(date_time)
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: run the scraper with its default arguments.
    scrape()
|
||||
Loading…
x
Reference in New Issue
Block a user