initial image scraper code
parent
312cb1d1d1
commit
124ed5801c
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,39 @@
|
||||
import glob
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
import requests
|
||||
import rdflib
|
||||
|
||||
# HTTP headers sent with the image download request below; the User-Agent
# presents the script as a desktop Firefox browser.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'}
|
||||
|
||||
def find_image(rdf_doc):
    """
    Return a dwc:associatedMedia resource from the rdf_doc, or None.

    rdf_doc is handed unchanged to rdflib's Graph.parse (the caller in this
    file passes the URL of a specimen's RDF document).

    Returns the object of a triple whose predicate is
    http://rs.tdwg.org/dwc/terms/associatedMedia, or None when the document
    has no such triple. An RDF graph is a set of triples, so when several
    media resources are present which one comes back is not guaranteed.
    """
    graph = rdflib.Graph()
    graph.parse(rdf_doc)
    mediaref = rdflib.URIRef("http://rs.tdwg.org/dwc/terms/associatedMedia")
    # Ask the graph for matching objects directly instead of walking every
    # triple and comparing predicates by hand.
    return next(iter(graph.objects(predicate=mediaref)), None)
|
||||
|
||||
|
||||
# Walk the barcode list and download the raw specimen image for the first
# barcode that has not been scraped yet.
with open("barcode_cleaned.csv") as bcfile:
    for line in bcfile:
        # The barcode is the first comma-separated column. Strip it so a
        # trailing newline (single-column rows) never leaks into the glob
        # pattern, the URL or the file name.
        barcode = line.split(",")[0].strip()
        if not barcode:
            # Blank line: nothing to fetch.
            continue

        # skip specimens we have already scraped
        if glob.glob("specimen_img_raw/" + barcode + "*"):
            print("we already have", barcode, "skipping to...")
            continue

        rdf_url = "https://www.botanicalcollections.be/specimen/" + barcode + "/rdf"
        print("fetching", rdf_url)
        img_url = find_image(rdf_url)
        if img_url is None:
            # find_image found no associatedMedia triple; move on rather
            # than crashing while deriving a file extension from "None".
            print("no image found for", barcode)
            continue

        # Reuse whatever follows the last "." in the URL as the extension;
        # if the URL has no "." at all, save without an extension.
        _, dot, img_ext = str(img_url).rpartition(".")
        print("image url:", img_url)
        save_path = "specimen_img_raw/" + barcode + (dot + img_ext if dot else "")

        save_resp = requests.get(
            img_url, headers=HEADERS, allow_redirects=True, timeout=60
        )
        # Fail loudly on HTTP errors: a saved error page would otherwise be
        # treated as "already scraped" on every later run.
        save_resp.raise_for_status()
        with open(save_path, "wb") as img_file:
            img_file.write(save_resp.content)

        # Stop after the first successful download, as the original script
        # did. NOTE(review): presumably a limit for initial testing --
        # confirm before removing.
        break
|
@ -0,0 +1,9 @@
|
||||
certifi==2022.5.18.1
|
||||
charset-normalizer==2.0.12
|
||||
idna==3.3
|
||||
isodate==0.6.1
|
||||
pyparsing==3.0.9
|
||||
rdflib==6.1.1
|
||||
requests==2.27.1
|
||||
six==1.16.0
|
||||
urllib3==1.26.9
|
@ -0,0 +1 @@
|
||||
These are the unprocessed herbarium images. The filenames map to the barcodes.
|
Loading…
Reference in New Issue