From 664c7c46a00a3e41ca99389514e318eac6edb9c2 Mon Sep 17 00:00:00 2001
From: jude
Date: Mon, 26 Feb 2024 13:02:43 +0000
Subject: [PATCH] Moved to single file

---
 .gitignore       |   1 +
 fetch_data.py    |  69 --------------------
 gpx.py           | 161 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   1 +
 to_gpx.py        |  31 ---------
 5 files changed, 163 insertions(+), 100 deletions(-)
 delete mode 100644 fetch_data.py
 create mode 100644 gpx.py
 delete mode 100644 to_gpx.py

diff --git a/.gitignore b/.gitignore
index 2518f65..f4b2f54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,4 @@ cython_debug/
 data.json
 *.osm
 *.gpx
+cache/
diff --git a/fetch_data.py b/fetch_data.py
deleted file mode 100644
index cb85128..0000000
--- a/fetch_data.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import re
-from io import StringIO
-import csv
-import json
-
-import requests
-
-
-def get_megalithic_data(country=1):
-    def megalithic_url(country):
-        return 'https://www.megalithic.co.uk/cache/csvmap_country{}.csv'.format(country)
-
-    # Megalithic doesn't really want people scraping
-    response = requests.get(
-        megalithic_url(country),
-        headers={
-            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0'
-        }
-    )
-    if response.status_code != 200:
-        raise Exception('Failed to fetch data from Megalithic.UK: {}'.format(response.text))
-
-    content = StringIO(response.text)
-    reader = csv.DictReader(content, delimiter='|')
-
-    data = []
-    for row in reader:
-        data.append({
-            'lat': row['lat'],
-            'lng': row['lng'],
-            'name': row['Name'],
-            'type': row['Type'],
-            'url': 'https://megalithic.co.uk/article.php?sid={}'.format(row['SID']),
-        })
-
-    return data
-
-
-def get_stone_circles_data():
-    response = requests.get('http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js')
-
-    if response.status_code != 200:
-        raise Exception('Failed to fetch data from stone-circles.org.uk: {}'.format(response.text))
-
-    content = re.match(
-        r'.+ = \[(\[.+]),?];',
-        response.text.replace('\n', '')
-    )
-    content = re.sub(r'\\(?!")', '', content.groups()[0])
-    arr = json.loads('[{}]'.format(content))
-
-    data = []
-    for item in arr:
-        data.append({
-            'lat': item[0],
-            'lng': item[1],
-            'name': re.sub(r'<.+?>', '', re.match(r'<b>(.+)</b>', item[2]).groups()[0]),
-            'type': re.sub(r'.+>', '', item[2].replace('<br>', ' ')),
-            'url': 'http://www.stone-circles.org.uk/stone/{}'.format(re.search(r'href=([a-zA-Z.]+)', item[2]).groups()[0]),
-        })
-
-    return data
-
-
-if __name__ == '__main__':
-    all_data = get_stone_circles_data()  # + get_megalithic_data()
-
-    with open('data.json', 'w') as f:
-        json.dump(all_data, f)
diff --git a/gpx.py b/gpx.py
new file mode 100644
index 0000000..c90b988
--- /dev/null
+++ b/gpx.py
@@ -0,0 +1,161 @@
+import csv
+import json
+import argparse
+from html import escape
+from pathlib import Path
+import re
+
+import requests
+
+parser = argparse.ArgumentParser(
+    prog="megalithosm",
+    description="Fetch data from megalith sources and produce a GPX file",
+)
+
+parser.add_argument(
+    "-q",
+    "--quality",
+    default=5,
+    type=int,
+    help="Include sites of this quality or higher",
+)
+parser.add_argument("-r", "--refetch", action="store_true", default=False)
+
+args = parser.parse_args()
+
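+# Upper bound (exclusive) for megalithic.co.uk's numeric country IDs: the
+# cache and post-processing loops below fetch one CSV per country, IDs 1-5.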
+country_range = 6
+
+
+def cache_stone_circles():
+    response = requests.get(
+        "http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js"
+    )
+    if response.status_code != 200:
+        raise Exception(
+            "Failed to fetch data from stone-circles.org.uk: {}".format(response.text)
+        )
+
+    Path("cache").mkdir(exist_ok=True)
+    with open("cache/stone-circles.js", "w") as f:
+        f.write(response.text)
+
+
+def cache_megalithic():
+    def megalithic_url(country):
+        return "https://www.megalithic.co.uk/cache/csvmap_country{}.csv".format(country)
+
+    for country in range(1, country_range):
+        # Megalithic doesn't really want people scraping
+        response = requests.get(
+            megalithic_url(country),
+            headers={
+                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
+            },
+        )
+        if response.status_code != 200:
+            raise Exception(
+                "Failed to fetch data from Megalithic.UK: {}".format(response.text)
+            )
+
+        Path("cache").mkdir(exist_ok=True)
+        with open("cache/megalithic-{}.csv".format(country), "w") as f:
+            f.write(response.text)
+
+
+if args.refetch or not Path("cache/stone-circles.js").exists():
+    print("Refreshing stone-circles.org.uk cache")
+    cache_stone_circles()
+
+if args.refetch or not all(
+    Path("cache/megalithic-{}.csv".format(c)).exists() for c in range(1, country_range)
+):
+    print("Refreshing megalithic.co.uk cache")
+    cache_megalithic()
+
+
+print("Post-processing data")
+data = []
+for country in range(1, country_range):
+    with open("cache/megalithic-{}.csv".format(country)) as f:
+        reader = csv.DictReader(f, delimiter="|")
+        types = set()
+        for row in reader:
+            types.add(row["Type"])
+            data.append(
+                {
+                    "lat": row["lat"],
+                    "lng": row["lng"],
+                    "name": row["Name"],
+                    "type": row["Type"],
+                    "url": "https://megalithic.co.uk/article.php?sid={}".format(
+                        row["SID"]
+                    ),
+                    "quality": int(row["Condition"]),
+                }
+            )
+
+with open("cache/stone-circles.js") as f:
+    content = f.read()
+    content = re.match(r".+ = \[(\[.+]),?];", content.replace("\n", ""))
+    content = re.sub(r'\\(?!")', "", content.groups()[0])
+    arr = json.loads("[{}]".format(content))
+    for item in arr:
+        data.append(
+            {
+                "lat": item[0],
+                "lng": item[1],
+                "name": re.sub(
+                    r"<.+?>", "", re.match(r"<b>(.+)</b>", item[2]).groups()[0]
+                ),
+                "type": re.sub(r".+>", "", item[2].replace("<br>", " ")),
+                "url": "http://www.stone-circles.org.uk/stone/{}".format(
+                    re.search(r"href=([a-zA-Z.]+)", item[2]).groups()[0]
+                ),
+                "quality": 5,
+            }
+        )
+
+
+print("Generating GPX")
+with open("Megaliths.gpx", "w") as gpx_file:
+    gpx_file.write(
+        """<?xml version="1.0" encoding="UTF-8"?>
+<gpx xmlns="http://www.topografix.com/GPX/1/1" version="1.1" creator="megalithosm">
+<metadata>
+    <name>Megalith sites</name>
+    <author>
+        <name>Jude Southworth</name>
+    </author>
+</metadata>
+"""
+    )
+
+    seen_sites = set()
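+    # Both sources list many of the same sites, so key each waypoint on its
+    # coordinates plus a whitespace-stripped, lowercased name and emit each
+    # key at most once.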
", " ")), + "url": "http://www.stone-circles.org.uk/stone/{}".format( + re.search(r"href=([a-zA-Z.]+)", item[2]).groups()[0] + ), + "quality": 5, + } + ) + + +print("Generating GPX") +with open("Megaliths.gpx", "w") as gpx_file: + gpx_file.write( + """ + + + Megalith sites + + Jude Southworth + + """ + ) + + seen_sites = set() + for poi in data: + norm_name = re.sub(r"\s", "", poi["name"]).lower() + site_key = "{},{},{}".format(poi["lat"], poi["lng"], norm_name) + if site_key in seen_sites: + print("Omitting duplicate site: {}".format(poi["name"])) + continue + if poi["quality"] < args.quality: + continue + if poi["type"] in ["Museum", "Modern Stone Circle etc"]: + print("Omitting uninteresting feature: {}".format(poi["name"])) + continue + + # Deduplicate entries + seen_sites.add(site_key) + name = "{} ({})".format(poi["name"], poi["type"].strip()) + gpx_file.write( + """ + + {} + {} + """.format( + poi["lat"], poi["lng"], escape(name), escape(poi["url"]) + ) + ) + gpx_file.write("""\n""") diff --git a/requirements.txt b/requirements.txt index cc8680d..5bc81e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests~=2.31 pandas~=2.2 +black~=24.2 diff --git a/to_gpx.py b/to_gpx.py deleted file mode 100644 index 82ae066..0000000 --- a/to_gpx.py +++ /dev/null @@ -1,31 +0,0 @@ -import json -from html import escape - -with open('data.json') as data_file: - data = json.load(data_file) - -with open('Megaliths.gpx', 'w') as gpx_file: - gpx_file.write(''' - - - Megalith sites - - Jude Southworth - - ''') - - for poi in data: - name = '{} ({})'.format(poi['name'], poi['type'].strip()) - - gpx_file.write( - ''' - - {} - {} - '''.format(poi['lat'], poi['lng'], escape(name), poi['url']) - ) - gpx_file.write('')