mirror of
https://github.com/lucaspalomodevelop/eventcally.git
synced 2026-03-13 00:07:22 +00:00
118 lines
3.9 KiB
Python
118 lines
3.9 KiB
Python
from app import app, db, get_admin_unit
|
|
from pprint import pprint
|
|
import datetime
|
|
from dateutil import parser, tz
|
|
import pytz
|
|
from urllib.request import urlopen, URLError
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
from os import path
|
|
import json
|
|
import re
|
|
import unicodedata
|
|
from models import Event, EventStatus, EventCategory, EventPlace, EventOrganizer, AdminUnit
|
|
|
|
# Time zone that parse_date_time_str attaches to the naive timestamps it parses.
berlin_tz = pytz.timezone('Europe/Berlin')
|
|
|
|
def _load_document(url, debug):
    """Fetch the page at ``url`` and return it as a BeautifulSoup document.

    With ``debug`` set, the page is downloaded once into ``tmp/fp.html`` and
    later runs parse that cached copy instead of hitting the network again.
    """
    if not debug:
        with urlopen(url) as response:
            return BeautifulSoup(response, 'html.parser')

    filename = "tmp/fp.html"
    if not path.exists(filename):
        # Read the whole body before creating the file so that a failed
        # download does not leave an empty cache behind.
        with urlopen(url) as response:
            payload = response.read()
        with open(filename, "wb") as cache_file:
            cache_file.write(payload)

    # Binary mode hands BeautifulSoup the raw bytes (as the live branch does)
    # rather than text decoded with the platform's default encoding, and the
    # context manager closes the handle once the parser has consumed it.
    with open(filename, "rb") as cache_file:
        return BeautifulSoup(cache_file, 'html.parser')


def _extract_js_events(doc):
    """Return the first ``events`` payload embedded in a ``window.fp_initial`` script.

    The page assigns its data as ``key: JSON.parse('...')`` inside a
    ``<script>`` element. Returns the decoded JSON value, or ``None`` when no
    such assignment is present.
    """
    js_assigns_regex = r"(\w*)\s*:\s*JSON\.parse\('(.*)'\)"

    for javascript in doc.find_all('script'):
        javascript_contents = javascript.contents[0] if javascript.contents else ''
        if 'window.fp_initial' not in javascript_contents:
            continue

        for key, json_str in re.findall(js_assigns_regex, javascript_contents, re.MULTILINE):
            if key != 'events':
                continue

            # The capture is the inside of a JavaScript string literal:
            # unicode_escape resolves its backslash escapes while mapping every
            # byte to the code point of the same value, so the latin-1/utf-8
            # round trip restores the original non-ASCII characters.
            # NOTE(review): an escape above U+00FF would make the latin-1 step
            # raise UnicodeEncodeError - confirm the site never emits those.
            decoded_json_str = (
                json_str.encode('utf-8')
                .decode('unicode_escape')
                .encode('latin-1')
                .decode('utf-8')
            )
            # strict=False tolerates raw control characters inside strings.
            return json.loads(decoded_json_str, strict=False)

    return None


def scrape(debug=True):
    """Scrape the Goslar feripro programme page and print the extracted events.

    Args:
        debug: When true (the default), parse a copy of the page cached at
            ``tmp/fp.html``, downloading it first if it does not exist yet.
            When false, always fetch the page from the network.

    Raises:
        KeyError: if the page contains no ``events`` assignment, or an event
            record lacks one of the expected fields.

    For every event an ``Event`` is created and given its admin unit, and the
    mapped fields are printed; nothing is written to the database here.
    """
    url = 'https://goslar.feripro.de/programm/40/anmeldung/veranstaltungen'

    js_events = _extract_js_events(_load_document(url, debug))
    if js_events is None:
        raise KeyError("no 'events' JSON.parse assignment found in a window.fp_initial script")

    for js_event in js_events:
        event = Event()
        event.admin_unit = get_admin_unit('Stadt Goslar')

        organizer = js_event['organizer']

        name = js_event['name']
        start = parse_date_time_str(js_event['start'])
        end = parse_date_time_str(js_event['end'])
        # Build a new list so the parsed event record is not modified.
        tags = ','.join(list(js_event['tags']) + ['Ferienpass'])

        # The next values are mapped from the record but not consumed yet by
        # the code below.
        kid_friendly = True
        accessible_for_free = js_event['price'] == '0.00'
        description = js_event['description']
        if js_event['teaser'] and js_event['teaser'] != js_event['description']:
            description = js_event['teaser'] + '\n\n' + description
        if js_event['additional_info']:
            description = description + '\n\n' + js_event['additional_info']

        external_link = url + '#' + str(js_event['event_id'])

        from_age = int(js_event['min_age']) if js_event['min_age'] else None
        to_age = int(js_event['max_age']) if js_event['max_age'] else None

        status = EventStatus.cancelled if js_event['canceled'] else EventStatus.scheduled

        # Collapse a multi-line address into one line; an empty or null value
        # is passed through unchanged instead of raising AttributeError.
        meeting_point = js_event['meeting_point']
        if meeting_point:
            meeting_point = meeting_point.replace('\r\n', ', ')

        # Event-specific public contact data wins over the organizer's own.
        host_name = js_event['name_public'] or organizer['name']
        host_phone = js_event['phone_public'] or organizer['phone']
        host_email = js_event['email_public'] or organizer['email']
        host_url = organizer['website'] or organizer['facebook']

        print(
            external_link,
            name,
            start,
            end,
            status,
            tags,
            from_age,
            to_age,
            meeting_point,
            js_event['meeting_point_latitude'],
            js_event['meeting_point_longitude'],
            host_name,
            host_phone,
            host_email,
            host_url)
|
|
|
|
def parse_date_time_str(date_time_str):
    """Parse an ISO-formatted date/time string into an aware Berlin datetime.

    Args:
        date_time_str: A string accepted by
            ``datetime.datetime.fromisoformat``; may be ``None`` or empty.

    Returns:
        ``None`` when the input is missing or empty, otherwise a
        timezone-aware ``datetime`` expressed in ``berlin_tz``.

    Raises:
        ValueError: if the string is not in a format ``fromisoformat`` accepts.
    """
    if not date_time_str:
        return None

    date_time = datetime.datetime.fromisoformat(date_time_str)
    if date_time.tzinfo is not None:
        # The string carried its own UTC offset. localize() only accepts naive
        # values and would raise ValueError, so convert the instant instead.
        return date_time.astimezone(berlin_tz)

    return berlin_tz.localize(date_time)
|
|
|
|
if __name__ == '__main__':
    # Executed as a script: run the scraper with its default arguments.
    scrape()