Moved to single file

2024-02-26 13:02:43 +00:00
parent 0a04d8f68f
commit 664c7c46a0
5 changed files with 163 additions and 100 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,4 @@ cython_debug/
 data.json
 *.osm
 *.gpx
 cache/
--- a/fetch_data.py
+++ b/fetch_data.py
@@ -1,69 +0,0 @@
 import re
 from io import StringIO
 import csv
 import json
 import requests
 def get_megalithic_data(country=1, timeout=30):
    """Download and parse the Megalithic site list for one country.

    The source is a pipe-delimited CSV; the ``lat``, ``lng``, ``Name``,
    ``Type`` and ``SID`` columns are read from each row.

    :param country: number substituted into the ``csvmap_country{}.csv`` URL.
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang the script indefinitely.
    :return: list of dicts with ``lat``, ``lng``, ``name``, ``type`` and
        ``url`` keys.
    :raises Exception: if the server answers with a non-200 status.
    """
    # Megalithic doesn't really want people scraping, hence the
    # browser-like User-Agent.
    response = requests.get(
        'https://www.megalithic.co.uk/cache/csvmap_country{}.csv'.format(country),
        headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0'
        },
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception('Failed to fetch data from Megalithic.UK: {}'.format(response.text))
    reader = csv.DictReader(StringIO(response.text), delimiter='|')
    return [
        {
            'lat': row['lat'],
            'lng': row['lng'],
            'name': row['Name'],
            'type': row['Type'],
            'url': 'https://megalithic.co.uk/article.php?sid={}'.format(row['SID']),
        }
        for row in reader
    ]
 def get_stone_circles_data(timeout=30):
    """Download and parse the stone-circles.org.uk coordinate list.

    The source is a JavaScript file assigning an array of
    ``[lat, lng, html]`` entries; the array is extracted with a regex and
    decoded as JSON, and name, type and link are pulled out of the HTML.

    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang the script indefinitely.
    :return: list of dicts with ``lat``, ``lng``, ``name``, ``type`` and
        ``url`` keys.
    :raises Exception: if the server answers with a non-200 status or the
        file does not have the expected ``... = [[...],];`` shape.
    """
    response = requests.get(
        'http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js',
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception('Failed to fetch data from stone-circles.org.uk: {}'.format(response.text))
    # Newlines are removed first because "." does not match them.
    match = re.match(
        r'.+ = \[(\[.+]),?];',
        response.text.replace('\n', '')
    )
    if match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise Exception('Unexpected coords.js format from stone-circles.org.uk')
    # Drop every backslash that is not escaping a double quote
    # (presumably these are JS escapes that JSON would reject).
    content = re.sub(r'\\(?!")', '', match.groups()[0])
    arr = json.loads('[{}]'.format(content))
    data = []
    for item in arr:
        data.append({
            'lat': item[0],
            'lng': item[1],
            # Text of the leading <b>...</b> element with nested tags removed.
            'name': re.sub(r'<.+?>', '', re.match(r'<b>(.+)</b>', item[2]).groups()[0]),
            # Whatever follows the last ">" once <br> tags become spaces.
            'type': re.sub(r'.+>', '', item[2].replace('<br>', ' ')),
            'url': 'http://www.stone-circles.org.uk/stone/{}'.format(re.search(r'href=([a-zA-Z.]+)', item[2]).groups()[0]),
        })
    return data
 if __name__ == '__main__':
    # Only the stone-circles.org.uk source is exported; combining it with
    # get_megalithic_data() is currently disabled.
    sites = get_stone_circles_data()
    serialised = json.dumps(sites)
    with open('data.json', 'w') as out:
        out.write(serialised)
--- a/gpx.py
+++ b/gpx.py
@@ -0,0 +1,161 @@
 import csv
 import json
 import argparse
 from html import escape
 from pathlib import Path
 import re
 import requests
 # Command-line interface. Arguments are parsed at import time because the
 # rest of this script runs as top-level statements that read `args`.
 parser = argparse.ArgumentParser(
    prog="megalithosm",
    description="Fetch data from megalith sources and produce a GPX file",
 )
 parser.add_argument(
    "-q",
    "--quality",
    default=5,
    type=int,
    help="Include sites of this quality or higher",
 )
 parser.add_argument(
    "-r",
    "--refetch",
    action="store_true",
    help="Re-download the source data even if a cached copy exists",
 )
 args = parser.parse_args()
 # Exclusive upper bound for the Megalithic country numbers: every use is
 # range(1, country_range), i.e. countries 1 to 5.
 country_range = 6
 def cache_stone_circles(timeout=30):
    """Download the stone-circles.org.uk coordinate file into the cache.

    The response body is written unchanged to ``cache/stone-circles.js``;
    the ``cache`` directory is created if it does not exist.

    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang the script indefinitely.
    :raises Exception: if the server answers with a non-200 status.
    """
    response = requests.get(
        "http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js",
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception(
            "Failed to fetch data from stone-circles.org.uk: {}".format(response.text)
        )
    Path("cache").mkdir(exist_ok=True)
    with open("cache/stone-circles.js", "w") as f:
        f.write(response.text)
 def cache_megalithic(timeout=30):
    """Download the Megalithic CSV for every country into the cache.

    One file per country number in ``range(1, country_range)`` is written
    to ``cache/megalithic-<country>.csv``; the ``cache`` directory is
    created if it does not exist.

    :param timeout: seconds to wait for each request before giving up, so a
        stalled connection cannot hang the script indefinitely.
    :raises Exception: if any request answers with a non-200 status; files
        already written for earlier countries are left in place.
    """

    def megalithic_url(country):
        return "https://www.megalithic.co.uk/cache/csvmap_country{}.csv".format(country)

    # Megalithic doesn't really want people scraping, hence the
    # browser-like User-Agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
    }
    # Loop-invariant, so create the directory once up front.
    Path("cache").mkdir(exist_ok=True)
    for country in range(1, country_range):
        response = requests.get(
            megalithic_url(country),
            headers=headers,
            timeout=timeout,
        )
        if response.status_code != 200:
            raise Exception(
                "Failed to fetch data from Megalithic.UK: {}".format(response.text)
            )
        with open("cache/megalithic-{}.csv".format(country), "w") as f:
            f.write(response.text)
 # Fetch the stone-circles.org.uk file when a refetch was requested or no
 # cached copy exists yet.
 if args.refetch or not Path("cache/stone-circles.js").exists():
    print("Refreshing stone-circles.org.uk cache")
    cache_stone_circles()

 # Fetch the megalithic.co.uk files when a refetch was requested or at
 # least one per-country file is missing from the cache.
 if args.refetch or any(
    not Path("cache/megalithic-{}.csv".format(c)).exists()
    for c in range(1, country_range)
 ):
    print("Refreshing megalithic.co.uk cache")
    cache_megalithic()
 print("Post-processing data")
 # Combined list of site dicts from both sources; every entry carries
 # "lat", "lng", "name", "type", "url" and "quality" keys.
 data = []
 for country in range(1, country_range):
    with open("cache/megalithic-{}.csv".format(country)) as f:
        # The cached Megalithic files are pipe-delimited with a header row.
        reader = csv.DictReader(f, delimiter="|")
        for row in reader:
            data.append(
                {
                    "lat": row["lat"],
                    "lng": row["lng"],
                    "name": row["Name"],
                    "type": row["Type"],
                    "url": "https://megalithic.co.uk/article.php?sid={}".format(
                        row["SID"]
                    ),
                    "quality": int(row["Condition"]),
                }
            )

 # The stone-circles cache is a single file, so it is parsed once, outside
 # the per-country loop; parsing it inside the loop appended every
 # stone-circles site once per country.
 with open("cache/stone-circles.js") as f:
    content = f.read()
 # Newlines are removed first because "." does not match them.
 content = re.match(r".+ = \[(\[.+]),?];", content.replace("\n", ""))
 if content is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise Exception(
        "Unexpected format in cache/stone-circles.js; try running with --refetch"
    )
 # Drop every backslash that is not escaping a double quote
 # (presumably these are JS escapes that JSON would reject).
 content = re.sub(r'\\(?!")', "", content.groups()[0])
 arr = json.loads("[{}]".format(content))
 for item in arr:
    data.append(
        {
            "lat": item[0],
            "lng": item[1],
            # Text of the leading <b>...</b> element with nested tags removed.
            "name": re.sub(
                r"<.+?>", "", re.match(r"<b>(.+)</b>", item[2]).groups()[0]
            ),
            # Whatever follows the last ">" once <br> tags become spaces.
            "type": re.sub(r".+>", "", item[2].replace("<br>", " ")),
            "url": "http://www.stone-circles.org.uk/stone/{}".format(
                re.search(r"href=([a-zA-Z.]+)", item[2]).groups()[0]
            ),
            # This source has no quality field; every site gets a fixed 5.
            "quality": 5,
        }
    )
 print("Generating GPX")
 # The XML declaration below promises UTF-8, so the file is opened as UTF-8
 # explicitly instead of relying on the platform's locale encoding.
 with open("Megaliths.gpx", "w", encoding="utf-8") as gpx_file:
    gpx_file.write(
        """<?xml version="1.0" encoding="UTF-8"?>
 <gpx xmlns="http://www.topografix.com/GPX/1/1" 
     version="1.1" 
     creator="megalithosm"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://www.topografix.com/GPX/1/1 http://www.topografix.com/GPX/1/1/gpx.xsd">
    <metadata>
        <name>Megalith sites</name>
        <author>
            <name>Jude Southworth</name>
        </author>
    </metadata>"""
    )
    # Keys of the sites already written, used to drop repeats.
    seen_sites = set()
    for poi in data:
        # A site is identified by its coordinates plus its name with all
        # whitespace removed and lower-cased.
        norm_name = re.sub(r"\s", "", poi["name"]).lower()
        site_key = "{},{},{}".format(poi["lat"], poi["lng"], norm_name)
        if site_key in seen_sites:
            print("Omitting duplicate site: {}".format(poi["name"]))
            continue
        if poi["quality"] < args.quality:
            continue
        if poi["type"] in ["Museum", "Modern Stone Circle etc"]:
            print("Omitting uninteresting feature: {}".format(poi["name"]))
            continue
        # Only sites that are actually written count as seen.
        seen_sites.add(site_key)
        name = "{} ({})".format(poi["name"], poi["type"].strip())
        # Every interpolated value comes from downloaded data, so all of
        # them are escaped, including the lat/lon attribute values.
        gpx_file.write(
            """
    <wpt lat="{}" lon="{}">
        <name>{}</name>
        <desc>{}</desc>
    </wpt>""".format(
                escape(str(poi["lat"])),
                escape(str(poi["lng"])),
                escape(name),
                escape(poi["url"]),
            )
        )
    gpx_file.write("""\n</gpx>""")
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 requests~=2.31
 pandas~=2.2
 black~=24.2
--- a/to_gpx.py
+++ b/to_gpx.py
@@ -1,31 +0,0 @@
 import json
 from html import escape
 # Load the site list; each entry is read for its 'lat', 'lng', 'name',
 # 'type' and 'url' keys below.
 with open('data.json', encoding='utf-8') as data_file:
    data = json.load(data_file)
 # The XML declaration below promises UTF-8, so the file is opened as UTF-8
 # explicitly instead of relying on the platform's locale encoding.
 with open('Megaliths.gpx', 'w', encoding='utf-8') as gpx_file:
    gpx_file.write('''<?xml version="1.0" encoding="UTF-8"?>
 <gpx xmlns="http://www.topografix.com/GPX/1/1" 
     version="1.1" 
     creator="megalithosm" 
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
     xsi:schemaLocation="http://www.topografix.com/GPX/1/1 http://www.topografix.com/GPX/1/1/gpx.xsd">
    <metadata>
        <name>Megalith sites</name>
        <author>
           <name>Jude Southworth</name>
        </author>
    </metadata>''')
    for poi in data:
        name = '{} ({})'.format(poi['name'], poi['type'].strip())
        # Every interpolated value is escaped (previously only the name
        # was), so markup characters in the data cannot break the XML.
        gpx_file.write(
            '''
    <wpt lat="{}" lon="{}">
        <name>{}</name>
        <desc>{}</desc>
    </wpt>'''.format(
                escape(str(poi['lat'])),
                escape(str(poi['lng'])),
                escape(name),
                escape(poi['url']),
            )
        )
    gpx_file.write('</gpx>')