eventcally/project/scrape/scrape_hi.py
Daniel Grams 273b3fb072 Internal/analysis (#4)
* Codestyle Black

* Flake8

* Test coverage
2020-11-13 23:05:39 +01:00

219 lines
7.3 KiB
Python

import copy
import datetime
import decimal
import json
from os import path
from pprint import pprint
from urllib import request

from sqlalchemy import and_, not_

from project import db
from project.models import (
    Event,
    EventOrganizer,
    EventPlace,
    EventReviewStatus,
    EventStatus,
    Location,
)
from project.services.admin_unit import get_admin_unit
from project.services.event import (
    update_event_dates_with_recurrence_rule,
    upsert_event_category,
)
# Module-level setup — runs at import time.
# NOTE(review): assumes the "Harzinfo" admin unit already exists in the DB;
# get_admin_unit would otherwise return None — confirm against the service.
admin_unit = get_admin_unit("Harzinfo")
# Fallback category applied to every event before tag-based categorization.
category = upsert_event_category("Other")

# Harzinfo full-text search endpoint.
base_url = "https://www.harzinfo.de"
url = base_url + "/?ndssearch=fullsearch&no_cache=1&L=0"

# Search request body template (shared by all cities) and the city list,
# both loaded from local JSON files next to this script.
with open("scrape_hi_req.json") as json_file:
    request_object = json.load(json_file)

with open("scrape_hi_cities.json") as json_file:
    cities = json.load(json_file)
def response_from_url(city):
    """POST a full-text search request for *city* and return the HTTP response.

    :param city: dict with at least ``id`` and ``title`` (``short_name`` may
        be empty, in which case ``title`` is used as the display name).
    :return: the open ``http.client.HTTPResponse``; the caller must close it.
    """
    # Deep-copy the shared template so repeated calls never mutate the
    # module-level request_object (the original aliased and modified it).
    body = copy.deepcopy(request_object)
    # The API expects a mapping of city id -> display name.
    body["searchFilter"]["ndsdestinationdataevent"]["city"] = {
        str(city["id"]): city["short_name"] or city["title"]
    }
    req = request.Request(url, data=bytes(json.dumps(body), encoding="utf-8"))
    req.add_header("Content-Type", "application/json")
    return request.urlopen(req)
def load_json(debug, city):
    """Return the parsed search-result JSON for *city*.

    :param debug: when truthy, cache the raw response in ``tmp/hi_<id>.html``
        and reuse the cached file on subsequent runs.
    :param city: dict with at least an ``id`` key.
    :return: the decoded JSON payload (a dict with a ``result`` list).
    """
    if debug:
        filename = "tmp/hi_%d.html" % (city["id"])
        if not path.exists(filename):
            # Cache miss: fetch once and persist the raw bytes.
            # BUGFIX: close the HTTP response (was leaked before).
            with response_from_url(city) as response:
                with open(filename, "wb") as text_file:
                    text_file.write(response.read())
        with open(filename) as json_file:
            return json.load(json_file)

    # Non-debug path: stream straight from the endpoint, closing the response.
    with response_from_url(city) as response:
        return json.load(response)
def parse_date_time_str(date_time_str):
    """Parse a minute-resolution ISO string (``YYYY-MM-DDTHH:MM``).

    Returns ``None`` for empty or missing input.
    """
    if date_time_str:
        # The feed omits seconds; append ":00" so fromisoformat accepts it.
        return datetime.datetime.fromisoformat("%s:00" % date_time_str)
    return None
# Priority-ordered mapping of Harzinfo tags to local category names;
# the first matching tag wins (mirrors the original elif chain order).
_TAG_CATEGORIES = [
    ("Ausstellung/Kunst", "Art"),
    ("Comedy", "Comedy"),
    ("Konzert/Musik", "Music"),
    ("Theater", "Theater"),
    ("Genuss/Gourmet", "Dining"),
    ("Gesundheit/Wellness", "Fitness"),
    ("Kinder/Jugend", "Family"),
    ("Markt/Flohmarkt", "Shopping"),
    ("Sport", "Sports"),
    ("Vortrag/Lesung", "Book"),
    ("Kabarett", "Art"),
    ("Musical", "Theater"),
    ("Weihnachtsmärkte", "Festival"),
    ("Stadt- und Volksfeste", "Festival"),
]


def scrape(debug, city):
    """Import all Harzinfo events for one *city* into the database.

    Upserts an organizer named after the city, upserts every event in the
    city's search result, then deletes events of this organizer that no
    longer appear in the feed.

    :param debug: passed through to :func:`load_json` (enables file caching).
    :param city: dict with ``id``, ``title`` and optional ``short_name``.
    """
    # Organizer: one per city, created on demand.
    organizer_name = city["short_name"] or city["title"]
    organizer = EventOrganizer.query.filter(
        and_(
            EventOrganizer.admin_unit_id == admin_unit.id,
            EventOrganizer.name == organizer_name,
        )
    ).first()
    if organizer is None:
        organizer = EventOrganizer(admin_unit_id=admin_unit.id, name=organizer_name)
        db.session.add(organizer)
        db.session.commit()
    print(organizer_name)

    response = load_json(debug, city)
    result = response["result"]
    event_ids = list()
    for item in result:
        try:
            # Events are matched by their external link (feed uid anchored
            # to the detail page URL).
            uid = str(item["uid"])
            external_link = base_url + item["link"] + "#" + uid
            event = Event.query.filter(
                and_(
                    Event.organizer_id == organizer.id,
                    Event.external_link == external_link,
                )
            ).first()
            did_create = False
            if event is None:
                event = Event()
                event.admin_unit = admin_unit
                event.organizer = organizer
                did_create = True
            event.category = category
            event.external_link = external_link
            event.review_status = EventReviewStatus.verified
            event.rating = int(item["rating"])
            event.name = item["title"]
            event.description = item["title"]
            start = parse_date_time_str(item["date"])
            update_event_dates_with_recurrence_rule(event, start, None)

            # Place: optional coordinates; (0, 0) is treated as "no location".
            place_name = item["location"]
            place_description = ""
            place_location = None
            if "latitude" in item and "longitude" in item:
                meeting_point_latitude = item["latitude"]
                meeting_point_longitude = item["longitude"]
                if meeting_point_latitude and meeting_point_longitude:
                    latitude = decimal.Decimal(meeting_point_latitude)
                    longitude = decimal.Decimal(meeting_point_longitude)
                    if latitude != 0 and longitude != 0:
                        place_location = Location()
                        place_location.latitude = latitude
                        place_location.longitude = longitude
            place = EventPlace.query.filter(
                and_(
                    EventPlace.admin_unit_id == admin_unit.id,
                    EventPlace.organizer_id == organizer.id,
                    EventPlace.name == place_name,
                )
            ).first()
            if place is None:
                place = EventPlace(
                    admin_unit_id=admin_unit.id,
                    organizer_id=organizer.id,
                    name=place_name,
                )
            place.description = place_description
            place.location = place_location
            event.event_place = place

            # Additional data
            event.status = (
                EventStatus.cancelled if item["canceled"] else EventStatus.scheduled
            )
            if "categories" in item:
                tag_list = list(item["categories"].values())
                # First matching tag (priority order) decides the category.
                for tag, category_name in _TAG_CATEGORIES:
                    if tag in tag_list:
                        event.category = upsert_event_category(category_name)
                        break
                if "Kinder/Jugend" in tag_list:
                    event.kid_friendly = True
                tag_list.append("Harzinfo")
                event.tags = ",".join(tag_list)
            print("%s %s %d" % (event.dates[0].start, event.name, event.rating))
            if did_create:
                db.session.add(event)
                db.session.commit()
            # BUGFIX: record the id only after a new event has been
            # committed. Before, a freshly created event appended None
            # (no id assigned yet), so the cleanup below deleted it again.
            event_ids.append(event.id)
        except Exception as e:
            print("Exception")
            print(e)
            pprint(item)

    # Remove events of this organizer that disappeared from the feed.
    Event.query.filter(
        and_(
            Event.admin_unit_id == admin_unit.id,
            Event.organizer_id == organizer.id,
            not_(Event.id.in_(event_ids)),
        )
    ).delete(synchronize_session="fetch")
    db.session.commit()
def _main():
    """Scrape events for every configured city (non-debug mode)."""
    for town in cities.values():
        scrape(False, town)


if __name__ == "__main__":
    _main()