initial image scraper code

3 years ago · 124ed5801c
parent 312cb1d1d1
commit 124ed5801c
4 changed files with 2208 additions and 0 deletions
--- a/barcode_cleaned.csv
+++ b/barcode_cleaned.csv
--- a/image_fetch.py
+++ b/image_fetch.py
@ -0,0 +1,39 @@
+import glob
+import time
+import urllib.request
+
+import requests
+import rdflib
+
+HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'}
+
+def find_image(rdf_doc):
+    """
+    Look up the media resource linked from an RDF document.
+
+    rdf_doc is handed unchanged to rdflib.Graph.parse(). The parsed triples
+    are scanned in the graph's own iteration order and the object of the
+    first triple whose predicate is dwc:associatedMedia is returned. When no
+    triple carries that predicate the result is None.
+    """
+    media_predicate = rdflib.URIRef("http://rs.tdwg.org/dwc/terms/associatedMedia")
+    specimen_graph = rdflib.Graph()
+    specimen_graph.parse(rdf_doc)
+    media_objects = (
+        value
+        for _subject, predicate, value in specimen_graph
+        if predicate == media_predicate
+    )
+    # next() with a default mirrors the implicit None of the plain loop
+    return next(media_objects, None)
+
+
+# Walk the barcode list and download the image of a specimen that has not
+# been scraped yet into specimen_img_raw/, named <barcode>.<extension>.
+with open("barcode_cleaned.csv") as bcfile:
+    for line in bcfile:
+        # The barcode is the first CSV column. strip() removes the trailing
+        # newline that stays attached on single-column rows and would
+        # otherwise leak into the glob pattern, the URL and the file name.
+        barcode = line.split(",")[0].strip()
+        if not barcode:
+            # blank row: an empty barcode would glob-match every saved file
+            continue
+
+        # skip barcodes we have already scraped
+        if glob.glob("specimen_img_raw/" + barcode + "*"):
+            print("we already have", barcode, "skipping to...")
+            continue
+
+        rdf_url = "https://www.botanicalcollections.be/specimen/" + barcode + "/rdf"
+        print("fetching", rdf_url)
+        img_url = find_image(rdf_url)
+        if img_url is None:
+            # find_image() returns None when the RDF has no associatedMedia;
+            # without this guard the extension lookup below raises IndexError
+            print("no image found for", barcode, "skipping")
+            continue
+
+        # everything after the last "." in the URL is used as the extension
+        img_ext = str(img_url).rsplit(".", 1)[1]
+        print("image url:", img_url)
+        save_path = "specimen_img_raw/" + barcode + "." + img_ext
+        save_resp = requests.get(img_url, headers=HEADERS, allow_redirects=True)
+        # fail loudly on 4xx/5xx instead of saving an error page as an image
+        save_resp.raise_for_status()
+        # context manager so the file handle is flushed and closed right away
+        with open(save_path, "wb") as img_file:
+            img_file.write(save_resp.content)
+        # NOTE(review): stops after the first successful download; this looks
+        # like a leftover from testing -- confirm before removing.
+        break
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,9 @@
+certifi==2022.5.18.1
+charset-normalizer==2.0.12
+idna==3.3
+isodate==0.6.1
+pyparsing==3.0.9
+rdflib==6.1.1
+requests==2.27.1
+six==1.16.0
+urllib3==1.26.9
--- a/specimen_img_raw/README.md
+++ b/specimen_img_raw/README.md
@ -0,0 +1 @@
+These are the unprocessed herbarium images. The filenames map to the barcodes.
				`@ -0,0 +1 @@`
				`These are the unprocessed herbarium images. The filenames map to the barcodes.`