import geopandas as gpd
import json
from pathlib import Path
from shapely.geometry import Point
from shapely.geometry import Polygon
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib
from urllib import parse
from skimage import io
BirdWatch Ireland - Species & Habitat Conservation
BirdWatch Ireland is the largest independent conservation organisation in Ireland, and its objective is the protection of wild birds and their habitats. They have been doing incredible work protecting birds and biodiversity in Ireland. Check out their amazing work here.
The data is collected from BirdWatch Ireland’s website using a web scraping technique. The website has a list of Ireland’s birds with detailed information on every species.
To create our dataset and merge with the data provided by the Department of Agriculture, Food and the Marine, the focus will be on the bird’s image and common name.
Note: scikit-image imread
: OpenCV represents images in BGR order, whereas scikit-image represents images in RGB order. To use OpenCV functions after downloading an image, there is an extra step: converting the image from RGB to BGR.
# Scraping notes:
# - parent container: div with class "birds-with-filters" (TODO confirm against the page markup)
# - listing pages are addressed with a path suffix such as page/2/
no_pages = 24  # upper bound of the page range iterated by the scraping loop
def _text_or_placeholder(tag):
    """Return a tag's text using the dataset's placeholder conventions.

    A missing tag (``None``) becomes the string ``'0'``, a tag whose text is
    empty becomes ``np.nan``, and any other tag yields its text unchanged.
    """
    if tag is None:
        return '0'
    return tag.text if tag.text != '' else np.nan


def _load_image(img_tag):
    """Download the image referenced by an ``<img>`` tag's ``data-src``.

    Returns the string ``'0'`` when the tag or its ``data-src`` attribute is
    missing, otherwise whatever ``io.imread`` returns for the URL.
    """
    str_image = img_tag.get('data-src') if img_tag is not None else None
    if not str_image:
        return '0'
    # Some image URLs contain accented characters (e.g.
    # https://birdwatchireland.ie/app/uploads/2019/02/Snowy-Owl-08-with-kill-René-Bruun.jpg),
    # so the URL is percent-encoded while leaving ':' and '/' untouched;
    # otherwise the download throws an error.
    return io.imread(urllib.parse.quote(str_image, safe=':/'))


def get_data(pageNo):
    """Scrape one page of BirdWatch Ireland's list of birds.

    Args:
        pageNo: Value appended (after ``str()``) to the listing URL; the
            scraping loop in this file passes suffixes such as ``"page/2/"``.

    Returns:
        A list with one entry per ``<article class="bird">`` element. Each
        entry is ``[image, bird_name, irish_name, scientific_name,
        bird_family]``. The image is the array returned by ``io.imread``;
        the four text fields come from the ``h3.title`` element and the
        first three ``<p>`` elements of ``div.bird-info``. A missing element
        is stored as the string ``'0'`` and an empty text as ``np.nan``.

    Raises:
        requests.HTTPError: If the server answers with an error status, so a
            failed page is not silently recorded as an empty one.
    """
    base_url = 'https://birdwatchireland.ie/irelands-birds-birdwatch-ireland/list-of-irelands-birds/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    # A timeout keeps one stalled connection from hanging the whole scrape.
    r = requests.get(base_url + str(pageNo), headers=headers, timeout=30)
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.content, 'html.parser')

    alls = []
    for d in soup.find_all('article', attrs={'class': 'bird'}):
        # image
        image_div = d.find('div', attrs={'class': 'bird-img'})
        img_html = image_div.find('img') if image_div is not None else None

        # text
        bird_name = d.find('h3', attrs={'class': 'title'})
        complementary_info = d.find('div', attrs={'class': 'bird-info'})
        bird_info = complementary_info.find_all('p') if complementary_info is not None else []
        # Pad to three entries so a card with fewer paragraphs yields
        # placeholders instead of raising IndexError.
        irish_name, scientific_name, bird_family = (list(bird_info) + [None] * 3)[:3]

        alls.append([
            _load_image(img_html),
            _text_or_placeholder(bird_name),
            _text_or_placeholder(irish_name),
            _text_or_placeholder(scientific_name),
            _text_or_placeholder(bird_family),
        ])
    return alls
def flatten(l):
    """Collapse a list of lists into one flat list, preserving order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


# Scrape every listing page in turn; each call yields that page's rows.
results = []
for i in range(1, no_pages + 1):
    url_to_append = "page/{}/".format(i)
    page_rows = get_data(url_to_append)
    results.append(page_rows)
    print(url_to_append + ': OK')

# One row per bird across all pages, persisted as a pickle because the
# Image column holds image arrays.
data_webscraping = flatten(results)
column_names = ['Image', 'Bird_Name', 'Irish_Name', 'Scientific_Name', 'Bird_Family']
df_birds = pd.DataFrame(data_webscraping, columns=column_names)
df_birds.to_pickle('./data/BirdWatchIreland.pkl')
#df_birds.to_csv('./data/BirdWatchIreland.csv', index=False)
H5N1 Wild Bird Species Identification
Dataset provided by Ireland’s Department of Agriculture, Food and the Marine which contains the locations of bird species captured in Ireland from 1980-09-01 to 2020-01-27 and wild birds that are targeted for the H5N1 strain of avian flu.
# The sightings file carries an .xls extension but is parsed here as
# latin-1 encoded delimited text.
wild_birds_path = "./data/98696_58589762-e8f9-4bb0-9d39-09570efbad62.xls"
wild_birds = pd.read_csv(wild_birds_path, encoding='latin-1')

# Scraped BirdWatch Ireland table written earlier in this file.
birdwatch_path = './data/BirdWatchIreland.pkl'
birdwatch = pd.read_pickle(birdwatch_path)

wild_birds.head()
Scientific_Name | Common_Name | Date | Year | Month | Day | Time | Country | Country_State_County | State | County | Locality | Latitude | Longitude | Parent_Species | target_H5_HPAI | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Acrocephalus scirpaceus | Eurasian Reed Warbler | 15/09/2015 | 2015 | 9 | 15 | 1100 | Ireland | IE-C-GY | Connaught | Galway | Inishmore (Inis Mór) | 53.1291 | -9.7507 | Acrocephalus scirpaceus | 0 |
1 | Acrocephalus scirpaceus | Eurasian Reed Warbler | 15/09/2015 | 2015 | 9 | 15 | 1100 | Ireland | IE-C-GY | Connaught | Galway | Inishmore (Inis Mór) | 53.1291 | -9.7507 | Acrocephalus scirpaceus | 0 |
2 | Limosa haemastica | Hudsonian Godwit | 15/09/2015 | 2015 | 9 | 15 | 1100 | Ireland | IE-C-GY | Connaught | Galway | Inishmore (Inis Mór) | 53.1291 | -9.7507 | Limosa haemastica | 0 |
3 | Limosa haemastica | Hudsonian Godwit | 15/09/2015 | 2015 | 9 | 15 | 1100 | Ireland | IE-C-GY | Connaught | Galway | Inishmore (Inis Mór) | 53.1291 | -9.7507 | Limosa haemastica | 0 |
4 | Limosa haemastica | Hudsonian Godwit | 15/09/2015 | 2015 | 9 | 15 | 1100 | Ireland | IE-C-GY | Connaught | Galway | Inishmore (Inis Mór) | 53.1291 | -9.7507 | Limosa haemastica | 0 |
JOIN the two bird datasets to link each image to its species, using Scientific Name as the key column.
# Preview the first rows of the table loaded from BirdWatchIreland.pkl.
birdwatch.head()
Image | Bird_Name | Irish_Name | Scientific_Name | Bird_Family | |
---|---|---|---|---|---|
0 | [[[193, 145, 9], [189, 141, 5], [185, 137, 1],... | Arctic Tern | Geabhróg artach | Sterna paradisaea | Terns |
1 | [[[33, 72, 87], [35, 74, 89], [37, 76, 93], [4... | Balearic Shearwater | Cánóg Bhailéarach | Puffinus mauretanicus | Tubenoses |
2 | [[[52, 64, 88], [55, 67, 91], [57, 69, 93], [5... | Bar-tailed Godwit | Guilbneach stríocearrach | Limosa lapponica | Waders |
3 | [[[89, 78, 46], [88, 77, 45], [85, 74, 42], [8... | Barn Owl | Scréachóg reilige | Tyto alba | Owls |
4 | [[[130, 112, 74], [129, 111, 73], [127, 109, 7... | Barnacle Goose | Gé ghiúrainn | Branta leucopsis | Geese |
# Attach the BirdWatch Ireland columns to every sighting, matching on the
# scientific name; clashing column names get the two suffixes.
birdwatch_by_species = birdwatch.set_index('Scientific_Name')
bird_flu = wild_birds.join(
    birdwatch_by_species,
    on='Scientific_Name',
    lsuffix='_original',
    rsuffix='_bwi',
)

# Selecting only infected birds (rows flagged target_H5_HPAI == 1)
is_flagged = bird_flu['target_H5_HPAI'] == 1
infected_birds = bird_flu[is_flagged]

# Number of flagged sightings per species, most frequent first.
sightings_per_species = infected_birds.groupby('Scientific_Name').size()
top_infected_species = sightings_per_species.sort_values(ascending=False)
top_infected_species
Scientific_Name
Chroicocephalus ridibundus 332
Cygnus olor 273
Ardea cinerea 259
Egretta garzetta 227
Larus marinus 152
Anas platyrhynchos 144
Buteo buteo 121
Aythya fuligula 119
Pica pica 118
Cygnus cygnus 105
Falco peregrinus 92
Tadorna tadorna 83
Tachybaptus ruficollis 78
Larus canus 76
Branta bernicla 63
Somateria mollissima 44
Anser anser 43
Anas acuta 40
Aythya marila 37
Anser brachyrhynchus 33
Podiceps cristatus 29
Anas crecca 25
Turdus pilaris 21
Branta canadensis 18
Aythya ferina 18
Bucephala clangula 16
Mergus merganser 15
Haliaeetus albicilla 4
dtype: int64
# Join the per-species counts back onto the BirdWatch Ireland table, then
# show the species for which any joined column is missing.
infected_birds_new = top_infected_species.to_frame().join(
    birdwatch.set_index('Scientific_Name'),
    on='Scientific_Name',
    lsuffix='_original',
    rsuffix='_bwi',
)
has_missing_value = infected_birds_new.isna().any(axis=1)
infected_birds_new[has_missing_value]
0 | Image | Bird_Name | Irish_Name | Bird_Family | |
---|---|---|---|---|---|
Scientific_Name | |||||
Chroicocephalus ridibundus | 332 | NaN | NaN | NaN | NaN |
Branta bernicla | 63 | NaN | NaN | NaN | NaN |
Aythya marila | 37 | NaN | NaN | NaN | NaN |
There are 3 species among those classified as Infected Birds which do not have an image. Something must differ between the two datasets for these species. Let’s investigate.
- Chroicocephalus ridibundus
- Branta bernicla
- Aythya marila
Chroicocephalus ridibundus
: has the same Common Name, Black-headed Gull, in both datasets, but in BirdWatch Ireland’s dataset the Scientific Name is Larus ridibundus. I will switch the Scientific Name so that the join works.
Branta bernicla
: There are three subspecies of Brant (or Brent) Goose.
- Branta bernicla nigricans — Black-bellied Brant of extreme north-east Siberia to north central Canada.
- Branta bernicla bernicla — Dark-bellied Brant of northern and central Siberia.
- Branta bernicla hrota — Pale-bellied Brant of Canada, Greenland, Svalbard and Franz Josef Land.
A fourth Brent Goose population has been recorded in Ireland, though its taxonomic status remains uncertain and it has no scientific name. It is generally known colloquially as ‘Grey-bellied Brant’.
Department of Agriculture, Food and the Marine dataset
- Branta bernicla 63
- Branta bernicla hrota 45
- Branta bernicla bernicla 4
- Branta bernicla (Gray-bellied) 1
Black-bellied Brant (nigricans) is very similar to Light-bellied Brent Goose (hrota), and care is needed to distinguish the two subspecies. Brent Goose (Dark-bellied) and Black Brant are rare winter visitors. As Brent Goose (Light-bellied) is the most common subspecies in Ireland and is easily mistaken for Black-bellied Brant (nigricans), I will combine Pale-bellied Brant and Black-bellied Brant.
Sources:
https://www.waterfowl.org.uk/wildfowl/swans-geese-allies/brent-goose/
https://www.birdguides.com/articles/identification/brent-geese-photo-id-guide/
Aythya marila
: Greater Scaup from the Duck family is under the Scientific Name Anas marila on BirdWatch Ireland’s dataset.
# Work on copies so the loaded tables stay untouched.
wild_birds_copy = wild_birds.copy()
birdwatch_copy = birdwatch.copy()

# Fixing 1st issue 'Chroicocephalus ridibundus': rename the BirdWatch
# Ireland entry so both tables use the same scientific name.
birdwatch_copy['Scientific_Name'] = birdwatch_copy['Scientific_Name'].replace(
    {'Larus ridibundus': 'Chroicocephalus ridibundus'}
)

# Different Brent Geese species: one example row per common name, with the
# date/location detail columns hidden.
is_brent_goose = wild_birds_copy['Scientific_Name'].str.startswith('Branta bernicla')
hidden_columns = ['Year', 'Month','Day','Time','Country','Country_State_County','State','Latitude','Longitude']
brent_examples = wild_birds_copy[is_brent_goose].drop_duplicates(subset='Common_Name')
brent_examples.drop(hidden_columns, axis='columns')
Scientific_Name | Common_Name | Date | County | Locality | Parent_Species | target_H5_HPAI | |
---|---|---|---|---|---|---|---|
179 | Branta bernicla | Brant | 30/04/2016 | Donegal | Donegal | Branta bernicla | 1 |
1055 | Branta bernicla hrota | Brant (Atlantic) | 13/12/2016 | Wexford | Wexford Wildfowl Reserve | Branta bernicla | 0 |
6235 | Branta bernicla bernicla | Brant (Dark-bellied) | 03/01/2019 | Galway | Barna Pier, County Galway, IE (53.249, -9.15) | Branta bernicla | 0 |
6839 | Branta bernicla (Gray-bellied) | Brant (Gray-bellied) | 18/03/2019 | Louth | Dundalk Bay--Lurgangreen (hide and saltmarsh) | Branta bernicla | 0 |
# Show the BirdWatch Ireland rows whose Irish name is exactly 'Cadhan'.
birdwatch_copy[birdwatch_copy['Irish_Name'] == 'Cadhan']
Image | Bird_Name | Irish_Name | Scientific_Name | Bird_Family | |
---|---|---|---|---|---|
7 | [[[36, 18, 4], [33, 15, 1], [42, 24, 10], [49,... | Black Brant | Cadhan | Branta bernicla nigricans | Geese |
18 | [[[157, 171, 184], [157, 171, 184], [156, 170,... | Brent Goose (Dark-bellied) | Cadhan | Branta bernicla bernicla | Geese |
19 | [[[153, 176, 190], [153, 176, 190], [153, 176,... | Brent Goose (Light-bellied) | Cadhan | Branta bernicla hrota | Geese |
# NOTE: no target_H5_HPAI filter is applied here, so despite its name
# top_infected_species now counts every sighting in wild_birds_copy.
#infected_birds = bird_flu[bird_flu['target_H5_HPAI'] == 1]
all_species_counts = wild_birds_copy.groupby('Scientific_Name').size()
top_infected_species = all_species_counts.sort_values(ascending=False)

# Counts for every name that starts with 'Branta bernicla'.
is_brent_goose = top_infected_species.index.str.startswith('Branta bernicla')
top_infected_species[is_brent_goose]
Scientific_Name
Branta bernicla 63
Branta bernicla hrota 45
Branta bernicla bernicla 4
Branta bernicla (Gray-bellied) 1
dtype: int64
# Fixing 2nd issue 'Branta bernicla': sightings recorded under the bare
# species name are relabelled as the hrota subspecies.
wild_birds_copy['Scientific_Name'] = wild_birds_copy['Scientific_Name'].replace(
    {'Branta bernicla': 'Branta bernicla hrota'}
)

# List the BirdWatch Ireland rows in the 'Ducks' family.
is_duck = birdwatch_copy['Bird_Family'] == 'Ducks'
birdwatch_copy[is_duck]
Image | Bird_Name | Irish_Name | Scientific_Name | Bird_Family | |
---|---|---|---|---|---|
32 | [[[179, 185, 185], [179, 185, 185], [178, 184,... | Common Scoter | Scótar | Melanitta nigra | Ducks |
46 | [[[32, 36, 45], [33, 37, 46], [34, 38, 47], [3... | Eider | Éadar | Somateria mollissima | Ducks |
51 | [[[96, 131, 163], [96, 131, 163], [96, 131, 16... | Gadwall | Gadual | Anas strepera | Ducks |
54 | [[[119, 149, 77], [120, 150, 78], [122, 149, 7... | Garganey | Praslacha shamhraidh | Anas querquedula | Ducks |
59 | [[[159, 169, 171], [158, 168, 170], [158, 168,... | Goldeneye | Órshúileach | Bucephala clangula | Ducks |
61 | [[[66, 67, 51], [69, 70, 54], [72, 73, 57], [7... | Goosander | Síolta mhór | Mergus merganser | Ducks |
71 | [[[180, 196, 212], [180, 196, 212], [180, 196,... | Green-winged Teal | Praslacha ghlaseiteach | Anas carolinensis | Ducks |
108 | [[[26, 24, 12], [26, 24, 12], [26, 24, 12], [2... | Long-tailed Duck | Lacha earrfhada | Clangula hyemalis | Ducks |
111 | [[[119, 154, 192], [120, 155, 193], [122, 157,... | Mallard | Mallard | Anas platyrhynchos | Ducks |
128 | [[[102, 91, 59], [96, 85, 53], [91, 81, 46], [... | Pintail | Biorearrach | Anas acuta | Ducks |
129 | [[[193, 196, 203], [193, 196, 203], [193, 196,... | Pochard | Póiseard cíordhearg | Aythya ferina | Ducks |
137 | [[[21, 19, 20], [21, 19, 20], [21, 19, 20], [2... | Red-breasted Merganser | Síolta rua | Mergus serrator | Ducks |
149 | [[[89, 88, 86], [89, 88, 86], [90, 89, 87], [9... | Ring-necked Duck | Lacha mhuinceach | Aythya collaris | Ducks |
156 | [[[65, 72, 38], [65, 72, 38], [65, 72, 38], [6... | Ruddy Duck | Lacha rua | Oxyura jamaicensis | Ducks |
162 | [[[171, 161, 102], [167, 157, 98], [164, 152, ... | Scaup | Lacha iascán | Anas marila | Ducks |
165 | [[[62, 85, 116], [61, 84, 115], [61, 84, 115],... | Shelduck | Seil-lacha | Tadorna tadorna | Ducks |
167 | [[[105, 117, 129], [105, 117, 129], [105, 117,... | Shoveler | Spadalach | Anas clypeata | Ducks |
171 | [[[88, 109, 136], [87, 108, 135], [87, 108, 13... | Smew | Síolta gheal | Mergellus albellus | Ducks |
185 | [[[81, 107, 142], [86, 112, 147], [91, 117, 15... | Surf Scoter | Scótar toinne | Mellanitta perspicillata | Ducks |
188 | [[[204, 208, 217], [204, 208, 217], [204, 208,... | Teal | Praslacha | Anas crecca | Ducks |
191 | [[[159, 159, 157], [159, 159, 157], [159, 159,... | Tufted Duck | Lacha bhadánach | Aythya fuligula | Ducks |
195 | [[[131, 124, 105], [139, 132, 114], [117, 110,... | Velvet Scoter | Sceadach | Mellanitta fusca | Ducks |
204 | [[[51, 48, 39], [46, 43, 34], [42, 39, 30], [4... | Wigeon | Rualacha | Anas penelope | Ducks |
# Fixing 3rd issue 'Anas marila': rename the BirdWatch Ireland entry to the
# scientific name used in the sightings table.
birdwatch_copy['Scientific_Name'] = birdwatch_copy['Scientific_Name'].replace(
    {'Anas marila': 'Aythya marila'}
)

# Join the corrected tables on scientific name and persist the result.
birdwatch_by_species = birdwatch_copy.set_index('Scientific_Name')
final_df = wild_birds_copy.join(
    birdwatch_by_species,
    on='Scientific_Name',
    lsuffix='_original',
    rsuffix='_bwi',
)
final_df.to_pickle('./data/bird-flu.pkl')
Ordnance Survey Ireland (OSi) - Ireland’s National Mapping Agency
The spatial data is provided by Ordnance Survey Ireland (OSi) under a Creative Commons licence.
Ordnance Survey Ireland has evolved from the Ordnance Survey Office which was established in 1824, later becoming a state body under the Ordnance Survey Ireland Act 2001. Under this Act, Ordnance Survey Ireland continued its mainstream public service function of creating and maintaining the definitive mapping records of the State and also assumed the commercial function assigned to it under the Act of developing its commercial business and sales revenues.
Administrative Areas dataset generated from the 2019 OSi National Statutory Boundary dataset.
Dataset License: https://creativecommons.org/licenses/by/4.0/
# Remote GeoJSON holding the administrative areas, and the local path the
# table is written to at the end of this file.
url_geoJSON = 'https://opendata.arcgis.com/datasets/0d5984f732c54246bd087768223c92eb_0.geojson'
admin_areas_json = 'data/Administrative_Areas_Ireland.json'
# GeoJSON API: read the remote file straight into a GeoDataFrame
admin_areas = gpd.read_file(url_geoJSON, driver='GeoJSON')
Adding the count of bird flu occurrences to each Administrative Area
# Build one point per sighting from its Longitude/Latitude columns.
# points_from_xy creates the whole geometry column in a single vectorised
# call instead of assigning a Point row by row; x is longitude, y is latitude.
avian_flu = wild_birds.copy()
avian_flu['geometry'] = gpd.points_from_xy(avian_flu['Longitude'], avian_flu['Latitude'])
Coordinate Reference System (CRS): Setting a projection with Spatial Reference EPSG Code
# The points are built from Longitude/Latitude in decimal degrees, so they
# are labelled as WGS84 (EPSG:4326). EPSG:29902 is the Irish Grid, a
# projected system measured in metres, which does not describe these values.
gdf_infected_birds = gpd.GeoDataFrame(avian_flu, geometry='geometry', crs='EPSG:4326')
The dataset with the birds’ information has only Latitude and Longitude, so first I convert them into geometry Points to use later in a polygon operation. After that, a loop runs over every Administrative Area, and an intersects operation checks which Points belong to that area’s Polygon.
# Adding the count of bird flu occurrences to each Administrative Area.
# The two flag masks do not depend on the area, so they are built once.
is_flagged = gdf_infected_birds['target_H5_HPAI'] == 1
is_unflagged = gdf_infected_birds['target_H5_HPAI'] == 0

for index, area in admin_areas.iterrows():
    # Evaluate the spatial predicate once per area and reuse it for both counts.
    in_area = gdf_infected_birds.intersects(area.geometry)
    count_infected_birds = int((is_flagged & in_area).sum())
    count_healthy_birds = int((is_unflagged & in_area).sum())
    total_birds = count_healthy_birds + count_infected_birds

    admin_areas.loc[index, 'TOTAL_BIRDS'] = total_birds
    admin_areas.loc[index, 'HEALTHY_BIRDS'] = count_healthy_birds
    admin_areas.loc[index, 'INFECTED_BIRDS'] = count_infected_birds

admin_areas.head()
ENGLISH | GAEILGE | CONTAE | COUNTY | PROVINCE | GUID | CENTROID_X | CENTROID_Y | AREA | CC_ID | OBJECTID | Shape__Area | Shape__Length | geometry | TOTAL_BIRDS | HEALTHY_BIRDS | INFECTED_BIRDS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | DUBLIN CITY COUNCIL | None | Baile Átha Cliath | DUBLIN | Leinster | 2ae19629-1433-13a3-e055-000000000001 | 716469.75 | 735272.06 | 1.283502e+08 | 265011 | 1 | 1.283502e+08 | 101493.212412 | POLYGON ((-6.38258 53.33367, -6.38261 53.33370... | 2161.0 | 1642.0 | 519.0 |
1 | CORK CITY COUNCIL | None | Corcaigh | CORK | Munster | 2ae19629-1434-13a3-e055-000000000001 | 565833.13 | 571933.83 | 1.865976e+08 | 45511 | 2 | 1.865976e+08 | 80293.730785 | POLYGON ((-8.38436 51.90533, -8.38425 51.90529... | 223.0 | 182.0 | 41.0 |
2 | GALWAY CITY COUNCIL | None | Gaillimh | GALWAY | Connacht | 2ae19629-1435-13a3-e055-000000000001 | 530067.66 | 726500.52 | 5.069505e+07 | 65011 | 3 | 5.069505e+07 | 64020.725628 | MULTIPOLYGON (((-9.13605 53.26682, -9.13606 53... | 989.0 | 730.0 | 259.0 |
3 | OFFALY COUNTY COUNCIL | None | Uíbh Fhailí | OFFALY | Leinster | 2ae19629-1496-13a3-e055-000000000001 | 631261.72 | 709672.35 | 2.000025e+09 | 185001 | 4 | 2.000025e+09 | 389927.708615 | POLYGON ((-7.97902 53.33689, -7.97878 53.33684... | 76.0 | 69.0 | 7.0 |
4 | WICKLOW COUNTY COUNCIL | None | Cill Mhantáin | WICKLOW | Leinster | 2ae19629-149e-13a3-e055-000000000001 | 707784.79 | 690738.10 | 2.025161e+09 | 255001 | 5 | 2.025161e+09 | 320629.958733 | MULTIPOLYGON (((-6.14602 52.78372, -6.14607 52... | 1231.0 | 1062.0 | 169.0 |
# Save the administrative areas, including the bird-count columns added to
# them, to the local GeoJSON file named by admin_areas_json.
admin_areas.to_file(admin_areas_json, driver='GeoJSON')