You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
48 lines
1.5 KiB
Python
48 lines
1.5 KiB
Python
import glob
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
|
|
import requests
|
|
import rdflib
|
|
|
|
# Request headers carrying a Firefox-style User-Agent string; passed to
# requests.get() when the specimen images are downloaded.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'}
|
|
|
|
def find_image(rdf_doc):
    """
    Look up the media reference of a specimen RDF document.

    Parses rdf_doc with rdflib and returns the object of the first
    dwc:associatedMedia triple met while iterating the graph, or None
    when the document contains no such triple.
    """
    media_predicate = rdflib.URIRef("http://rs.tdwg.org/dwc/terms/associatedMedia")
    specimen_graph = rdflib.Graph()
    specimen_graph.parse(rdf_doc)
    # Lazily walk every triple and stop at the first one using the predicate.
    media_objects = (
        media
        for _, predicate, media in specimen_graph
        if predicate == media_predicate
    )
    return next(media_objects, None)
|
|
|
|
|
|
# Download one image per specimen listed in barcode_cleaned.csv (the barcode
# is the first comma-separated field of each row) and store it as
# specimen_img_raw/<barcode>.<extension>. Failures on a single barcode are
# reported and skipped so that the rest of the list is still processed.
with open("barcode_cleaned.csv") as bcfile:
    for line in bcfile:
        # strip() removes the trailing newline that survives split() when a
        # row has only one column; blank rows are ignored.
        barcode = line.split(",")[0].strip()
        if not barcode:
            continue

        # Skip files we have already scraped. Match "<barcode>.<ext>" (with
        # the dot, barcode escaped) so a barcode that is merely a prefix of
        # another one is not mistaken for an existing download.
        if glob.glob("specimen_img_raw/" + glob.escape(barcode) + ".*"):
            print("we already have", barcode, "skipping to...")
            continue

        rdf_url = "https://www.botanicalcollections.be/specimen/" + barcode + "/rdf"
        print("fetching", rdf_url)
        try:
            img_url = find_image(rdf_url)
        except urllib.error.HTTPError as er:
            print("Error on ", rdf_url, str(er))
            continue
        if img_url is None:
            print("Error. No image for barcode.")
            continue
        print("image url:", img_url)

        # Take the extension from the last path segment only; splitting the
        # whole URL on "." would return part of the host/path (including
        # "/") whenever the file name itself has no extension.
        file_name = str(img_url).rsplit("/", 1)[-1]
        if "." not in file_name:
            print("Error. No file extension in", img_url)
            continue
        img_ext = file_name.rsplit(".", 1)[1]
        save_path = "specimen_img_raw/" + barcode + "." + img_ext

        try:
            # The timeout keeps one stalled connection from hanging the run.
            save_resp = requests.get(
                img_url, headers=HEADERS, allow_redirects=True, timeout=60
            )
            # Do not store an HTTP error page as if it were the image: the
            # "already scraped" check above would then never retry it.
            save_resp.raise_for_status()
        except requests.RequestException as er:
            print("Error on ", img_url, str(er))
            continue

        # Context manager guarantees the file is flushed and closed.
        with open(save_path, "wb") as img_file:
            img_file.write(save_resp.content)

        # Be polite to the server: pause between successful downloads.
        time.sleep(1)
|