Skip to the content.

« BACK

import geopandas as gpd
import json
from pathlib import Path
from shapely.geometry import Point
from shapely.geometry import Polygon

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib
from urllib import parse
from skimage import io

BirdWatch Ireland - Species & Habitat Conservation

BirdWatch Ireland is the largest independent conservation organisation in Ireland, and its objective is the protection of wild birds and their habitats. The organisation has been doing incredible work protecting birds and biodiversity in Ireland. Check out their amazing work here.

The data is collected from BirdWatch Ireland’s website using web scraping. The site provides a list of Ireland’s birds with detailed information on every species.

To create our dataset and merge with the data provided by the Department of Agriculture, Food and the Marine, the focus will be on the bird’s image and common name.

Note: scikit-image imread: OpenCV represents images in BGR order, whereas scikit-image represents images in RGB order. To use OpenCV functions after downloading the image, an extra step is needed: converting the image from RGB to BGR.

# On the listing pages the bird entries sit under the parent div with class "birds-with-filters".
# Listing pages are addressed by appending e.g. "page/2/" to the list URL.
# Number of listing pages to scrape.
no_pages = 24

def _field_text(tag):
    """Return the tag's text, ``np.nan`` if that text is empty, or ``'0'`` if the tag is missing."""
    if tag is None:
        return '0'
    return tag.text if tag.text != '' else np.nan


def get_data(pageNo):
    """Scrape one listing page of BirdWatch Ireland's bird list.

    Args:
        pageNo: Path fragment appended to the list URL (for example
            ``"page/2/"``); it is converted with ``str()`` before use.

    Returns:
        A list with one row per ``<article class="bird">`` element. Each row
        is ``[image, bird_name, irish_name, scientific_name, bird_family]``.
        A field whose HTML element is missing is stored as the string
        ``'0'``; a text field whose element exists but is empty is stored as
        ``np.nan``. ``image`` is the value returned by ``io.imread`` for the
        image URL.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            a timeout).
    """
    headers = {
        # Kept as a single string: a literal broken across source lines is a syntax error.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) "
            "Gecko/20100101 Firefox/66.0"
        ),
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    r = requests.get(
        'https://birdwatchireland.ie/irelands-birds-birdwatch-ireland/list-of-irelands-birds/' + str(pageNo),
        headers=headers,
        timeout=30,  # never hang forever on an unresponsive server
    )
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.content, 'html.parser')

    alls = []
    for d in soup.find_all('article', attrs={'class': 'bird'}):
        # image
        image_div = d.find('div', attrs={'class': 'bird-img'})
        img_html = image_div.find('img') if image_div is not None else None

        # text
        bird_name = d.find('h3', attrs={'class': 'title'})
        complementary_info = d.find('div', attrs={'class': 'bird-info'})
        bird_info = complementary_info.find_all('p') if complementary_info is not None else []
        # Pad with None so an entry with fewer than three <p> tags yields
        # the '0' placeholder instead of raising IndexError.
        irish_name, scientific_name, bird_family = (list(bird_info) + [None] * 3)[:3]

        str_image = img_html.get('data-src') if img_html is not None else None
        if str_image:
            # One image URL on page 20 contains an accented character
            # (https://birdwatchireland.ie/app/uploads/2019/02/Snowy-Owl-08-with-kill-René-Bruun.jpg),
            # so the URL is percent-encoded while leaving ':' and '/' untouched;
            # otherwise the download throws an error.
            image = io.imread(urllib.parse.quote(str_image, safe=':/'))
        else:
            image = '0'

        alls.append([
            image,
            _field_text(bird_name),
            _field_text(irish_name),
            _field_text(scientific_name),
            _field_text(bird_family),
        ])

    return alls
# Visit every listing page in turn and keep the rows returned for each page.
results = []
for i in range(1, no_pages + 1):
    url_to_append = "page/" + str(i) + "/"
    page_rows = get_data(url_to_append)
    results.append(page_rows)
    print('{}: OK'.format(url_to_append))


def flatten(l):
    """Concatenate the items of every sub-list of *l* into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


# One row per bird across all pages, stored as a pickle so the image arrays survive.
data_webscraping = flatten(results)
column_names = ['Image', 'Bird_Name', 'Irish_Name', 'Scientific_Name', 'Bird_Family']
df_birds = pd.DataFrame(data_webscraping, columns=column_names)
df_birds.to_pickle('./data/BirdWatchIreland.pkl')

H5N1 Wild Bird Species Identification

Department of Agriculture, Food and the Marine

Dataset provided by Ireland’s Department of Agriculture, Food and the Marine which contains the locations of bird species captured in Ireland from 1980-09-01 to 2020-01-27 and wild birds that are targeted for the H5N1 strain of avian flu.

# Sightings dataset. The file name ends in .xls, but it is parsed here as Latin-1 encoded CSV text.
wild_birds = pd.read_csv("./data/98696_58589762-e8f9-4bb0-9d39-09570efbad62.xls", encoding='latin-1')
# Scraped BirdWatch Ireland table, read from the same path the scraping step pickles to.
# NOTE(review): unpickling can execute arbitrary code - only load a pickle file you produced yourself.
birdwatch = pd.read_pickle('./data/BirdWatchIreland.pkl')
# Preview the first rows of the sightings data.
wild_birds.head()
Scientific_NameCommon_NameDateYearMonthDayTimeCountryCountry_State_CountyStateCountyLocalityLatitudeLongitudeParent_Speciestarget_H5_HPAI
0Acrocephalus scirpaceusEurasian Reed Warbler15/09/201520159151100IrelandIE-C-GYConnaughtGalwayInishmore (Inis Mór)53.1291-9.7507Acrocephalus scirpaceus0
1Acrocephalus scirpaceusEurasian Reed Warbler15/09/201520159151100IrelandIE-C-GYConnaughtGalwayInishmore (Inis Mór)53.1291-9.7507Acrocephalus scirpaceus0
2Limosa haemasticaHudsonian Godwit15/09/201520159151100IrelandIE-C-GYConnaughtGalwayInishmore (Inis Mór)53.1291-9.7507Limosa haemastica0
3Limosa haemasticaHudsonian Godwit15/09/201520159151100IrelandIE-C-GYConnaughtGalwayInishmore (Inis Mór)53.1291-9.7507Limosa haemastica0
4Limosa haemasticaHudsonian Godwit15/09/201520159151100IrelandIE-C-GYConnaughtGalwayInishmore (Inis Mór)53.1291-9.7507Limosa haemastica0

Join the two bird datasets to link each image to its species, using the scientific name as the key column.

# Preview the first rows of the scraped BirdWatch Ireland table.
birdwatch.head()
ImageBird_NameIrish_NameScientific_NameBird_Family
0[[[193, 145, 9], [189, 141, 5], [185, 137, 1],...Arctic TernGeabhróg artachSterna paradisaeaTerns
1[[[33, 72, 87], [35, 74, 89], [37, 76, 93], [4...Balearic ShearwaterCánóg BhailéarachPuffinus mauretanicusTubenoses
2[[[52, 64, 88], [55, 67, 91], [57, 69, 93], [5...Bar-tailed GodwitGuilbneach stríocearrachLimosa lapponicaWaders
3[[[89, 78, 46], [88, 77, 45], [85, 74, 42], [8...Barn OwlScréachóg reiligeTyto albaOwls
4[[[130, 112, 74], [129, 111, 73], [127, 109, 7...Barnacle GooseGé ghiúrainnBranta leucopsisGeese
# Attach the BirdWatch Ireland columns to every sighting, matching on the scientific name.
birdwatch_by_species = birdwatch.set_index('Scientific_Name')
bird_flu = wild_birds.join(
    birdwatch_by_species,
    on='Scientific_Name',
    lsuffix='_original',
    rsuffix='_bwi',
)

# Keep only the rows whose target_H5_HPAI flag is 1.
# NOTE(review): the column name suggests "targeted for H5 HPAI" rather than a confirmed infection - verify against the dataset documentation.
is_flagged = bird_flu['target_H5_HPAI'] == 1
infected_birds = bird_flu[is_flagged]

# Number of flagged sightings per species, largest first.
species_counts = infected_birds.groupby('Scientific_Name').size()
top_infected_species = species_counts.sort_values(ascending=False)
top_infected_species
Scientific_Name
Chroicocephalus ridibundus    332
Cygnus olor                   273
Ardea cinerea                 259
Egretta garzetta              227
Larus marinus                 152
Anas platyrhynchos            144
Buteo buteo                   121
Aythya fuligula               119
Pica pica                     118
Cygnus cygnus                 105
Falco peregrinus               92
Tadorna tadorna                83
Tachybaptus ruficollis         78
Larus canus                    76
Branta bernicla                63
Somateria mollissima           44
Anser anser                    43
Anas acuta                     40
Aythya marila                  37
Anser brachyrhynchus           33
Podiceps cristatus             29
Anas crecca                    25
Turdus pilaris                 21
Branta canadensis              18
Aythya ferina                  18
Bucephala clangula             16
Mergus merganser               15
Haliaeetus albicilla            4
dtype: int64
# Pair each species count with its BirdWatch Ireland record,
# then list the species for which any column came back empty.
species_counts_df = top_infected_species.to_frame()
infected_birds_new = species_counts_df.join(
    birdwatch.set_index('Scientific_Name'),
    on='Scientific_Name',
    lsuffix='_original',
    rsuffix='_bwi',
)
has_missing = infected_birds_new.isna().any(axis=1)
infected_birds_new[has_missing]
0ImageBird_NameIrish_NameBird_Family
Scientific_Name
Chroicocephalus ridibundus332NaNNaNNaNNaN
Branta bernicla63NaNNaNNaNNaN
Aythya marila37NaNNaNNaNNaN

There are 3 species among those classified as Infected Birds which do not have an image. Something must differ between the two datasets. Let’s investigate.

Chroicocephalus ridibundus: both datasets use the same Common Name, Black-headed Gull, but in BirdWatch Ireland’s dataset the Scientific Name is Larus ridibundus. I will change that Scientific Name so that the join can work.


Branta bernicla: There are three subspecies of Brant (or Brent) Goose.

A fourth Brent Goose population has been recorded in Ireland, though its taxonomic status remains uncertain and it has no scientific name. It is generally known colloquially as ‘Grey-bellied Brant’.

Department of Agriculture, Food and the Marine dataset

Black-bellied Brant (nigricans) is very similar to Light-bellied Brent Goose (hrota), and care is needed to distinguish the two. Brent Goose (Dark-bellied) and Black Brant are rare winter visitors. As Brent Goose (Light-bellied) is the most common in Ireland and is easily mistaken for Black-bellied Brant (nigricans), I will combine Pale-bellied Brant and Black-bellied Brant.

Sources:

https://www.waterfowl.org.uk/wildfowl/swans-geese-allies/brent-goose/

https://www.birdguides.com/articles/identification/brent-geese-photo-id-guide/


Aythya marila: Greater Scaup from the Duck family is under the Scientific Name Anas marila on BirdWatch Ireland’s dataset.

# Work on copies so the frames loaded from disk stay untouched.
wild_birds_copy = wild_birds.copy()
birdwatch_copy = birdwatch.copy()

# Fixing 1st issue 'Chroicocephalus ridibundus': rename the BirdWatch Ireland
# entry so both datasets use the same scientific name.
birdwatch_copy['Scientific_Name'] = birdwatch_copy['Scientific_Name'].replace(
    'Larus ridibundus',
    'Chroicocephalus ridibundus',
)

# Different Brent Geese species: one example row per common name, with the
# date-part and location-detail columns hidden for readability.
is_brent = wild_birds_copy['Scientific_Name'].str.startswith('Branta bernicla')
hidden_columns = ['Year', 'Month', 'Day', 'Time', 'Country', 'Country_State_County', 'State', 'Latitude', 'Longitude']
brent_examples = wild_birds_copy[is_brent].drop_duplicates(subset='Common_Name')
brent_examples.drop(hidden_columns, axis='columns')
Scientific_NameCommon_NameDateCountyLocalityParent_Speciestarget_H5_HPAI
179Branta berniclaBrant30/04/2016DonegalDonegalBranta bernicla1
1055Branta bernicla hrotaBrant (Atlantic)13/12/2016WexfordWexford Wildfowl ReserveBranta bernicla0
6235Branta bernicla berniclaBrant (Dark-bellied)03/01/2019GalwayBarna Pier, County Galway, IE (53.249, -9.15)Branta bernicla0
6839Branta bernicla (Gray-bellied)Brant (Gray-bellied)18/03/2019LouthDundalk Bay--Lurgangreen (hide and saltmarsh)Branta bernicla0
# Show every BirdWatch Ireland entry whose Irish name is 'Cadhan'.
birdwatch_copy[birdwatch_copy['Irish_Name'] == 'Cadhan']
ImageBird_NameIrish_NameScientific_NameBird_Family
7[[[36, 18, 4], [33, 15, 1], [42, 24, 10], [49,...Black BrantCadhanBranta bernicla nigricansGeese
18[[[157, 171, 184], [157, 171, 184], [156, 170,...Brent Goose (Dark-bellied)CadhanBranta bernicla berniclaGeese
19[[[153, 176, 190], [153, 176, 190], [153, 176,...Brent Goose (Light-bellied)CadhanBranta bernicla hrotaGeese
# The target_H5_HPAI filter is commented out for this step:
#infected_birds = bird_flu[bird_flu['target_H5_HPAI'] == 1]
# NOTE(review): despite its name, top_infected_species is rebuilt here from ALL rows of
# wild_birds_copy (records per species, largest first), not only from flagged rows.
top_infected_species = wild_birds_copy.groupby('Scientific_Name').size().sort_values(ascending=False)
# Show the counts for every scientific name starting with 'Branta bernicla'.
top_infected_species[top_infected_species.index.str.startswith('Branta bernicla')]
Scientific_Name
Branta bernicla                   63
Branta bernicla hrota             45
Branta bernicla bernicla           4
Branta bernicla (Gray-bellied)     1
dtype: int64
# Fixing 2nd issue 'Branta bernicla': rows carrying only the bare species name are
# relabelled as 'Branta bernicla hrota'. The replacement matches whole values, so the
# names that already include a subspecies are left as they are.
wild_birds_copy['Scientific_Name'] = wild_birds_copy['Scientific_Name'].replace(['Branta bernicla'],'Branta bernicla hrota')
# List every BirdWatch Ireland entry in the 'Ducks' family.
birdwatch_copy[birdwatch_copy['Bird_Family'] == 'Ducks']
ImageBird_NameIrish_NameScientific_NameBird_Family
32[[[179, 185, 185], [179, 185, 185], [178, 184,...Common ScoterScótarMelanitta nigraDucks
46[[[32, 36, 45], [33, 37, 46], [34, 38, 47], [3...EiderÉadarSomateria mollissimaDucks
51[[[96, 131, 163], [96, 131, 163], [96, 131, 16...GadwallGadualAnas streperaDucks
54[[[119, 149, 77], [120, 150, 78], [122, 149, 7...GarganeyPraslacha shamhraidhAnas querquedulaDucks
59[[[159, 169, 171], [158, 168, 170], [158, 168,...GoldeneyeÓrshúileachBucephala clangulaDucks
61[[[66, 67, 51], [69, 70, 54], [72, 73, 57], [7...GoosanderSíolta mhórMergus merganserDucks
71[[[180, 196, 212], [180, 196, 212], [180, 196,...Green-winged TealPraslacha ghlaseiteachAnas carolinensisDucks
108[[[26, 24, 12], [26, 24, 12], [26, 24, 12], [2...Long-tailed DuckLacha earrfhadaClangula hyemalisDucks
111[[[119, 154, 192], [120, 155, 193], [122, 157,...MallardMallardAnas platyrhynchosDucks
128[[[102, 91, 59], [96, 85, 53], [91, 81, 46], [...PintailBiorearrachAnas acutaDucks
129[[[193, 196, 203], [193, 196, 203], [193, 196,...PochardPóiseard cíordheargAythya ferinaDucks
137[[[21, 19, 20], [21, 19, 20], [21, 19, 20], [2...Red-breasted MerganserSíolta ruaMergus serratorDucks
149[[[89, 88, 86], [89, 88, 86], [90, 89, 87], [9...Ring-necked DuckLacha mhuinceachAythya collarisDucks
156[[[65, 72, 38], [65, 72, 38], [65, 72, 38], [6...Ruddy DuckLacha ruaOxyura jamaicensisDucks
162[[[171, 161, 102], [167, 157, 98], [164, 152, ...ScaupLacha iascánAnas marilaDucks
165[[[62, 85, 116], [61, 84, 115], [61, 84, 115],...ShelduckSeil-lachaTadorna tadornaDucks
167[[[105, 117, 129], [105, 117, 129], [105, 117,...ShovelerSpadalachAnas clypeataDucks
171[[[88, 109, 136], [87, 108, 135], [87, 108, 13...SmewSíolta ghealMergellus albellusDucks
185[[[81, 107, 142], [86, 112, 147], [91, 117, 15...Surf ScoterScótar toinneMellanitta perspicillataDucks
188[[[204, 208, 217], [204, 208, 217], [204, 208,...TealPraslachaAnas creccaDucks
191[[[159, 159, 157], [159, 159, 157], [159, 159,...Tufted DuckLacha bhadánachAythya fuligulaDucks
195[[[131, 124, 105], [139, 132, 114], [117, 110,...Velvet ScoterSceadachMellanitta fuscaDucks
204[[[51, 48, 39], [46, 43, 34], [42, 39, 30], [4...WigeonRualachaAnas penelopeDucks
# Fixing 3rd issue 'Anas marila': align the BirdWatch Ireland name with 'Aythya marila'.
scientific_names = birdwatch_copy['Scientific_Name']
birdwatch_copy['Scientific_Name'] = scientific_names.replace('Anas marila', 'Aythya marila')

# Join the corrected tables on the scientific name and store the result as a pickle.
birdwatch_indexed = birdwatch_copy.set_index('Scientific_Name')
final_df = wild_birds_copy.join(
    birdwatch_indexed,
    on='Scientific_Name',
    lsuffix='_original',
    rsuffix='_bwi',
)

final_df.to_pickle('./data/bird-flu.pkl')

Ordnance Survey Ireland (OSi) - Ireland’s National Mapping Agency

The spatial data is provided by Ordnance Survey Ireland (OSi), Ireland’s National Mapping Agency, under a Creative Commons licence.

Ordnance Survey Ireland has evolved from the Ordnance Survey Office which was established in 1824, later becoming a state body under the Ordnance Survey Ireland Act 2001. Under this Act, Ordnance Survey Ireland continued its mainstream public service function of creating and maintaining the definitive mapping records of the State and also assumed the commercial function assigned to it under the Act of developing its commercial business and sales revenues.

Administrative Areas dataset generated from the 2019 OSi National Statutory Boundary dataset.

Dataset License: https://creativecommons.org/licenses/by/4.0/

# Remote GeoJSON file with the administrative-area boundaries.
url_geoJSON = 'https://opendata.arcgis.com/datasets/0d5984f732c54246bd087768223c92eb_0.geojson'
# Local path the boundaries are written to once the bird counts have been added.
admin_areas_json = 'data/Administrative_Areas_Ireland.json'
# GeoJSON API: read the boundaries straight from the URL into a GeoDataFrame.
admin_areas = gpd.read_file(url_geoJSON, driver='GeoJSON')

Adding the count of bird flu occurrences to each Administrative Area

# Build the geometry column in one pass instead of assigning row by row through
# iterrows()/.loc, which is very slow on large frames.
# Point takes (x, y), i.e. (longitude, latitude).
avian_flu = wild_birds.copy()
avian_flu['geometry'] = [
    Point(longitude, latitude)
    for longitude, latitude in zip(avian_flu['Longitude'], avian_flu['Latitude'])
]

Coordinate Reference System (CRS): Setting a projection with Spatial Reference EPSG Code

# The points are built from Longitude/Latitude values in degrees, so the frame is
# tagged as WGS84 (EPSG:4326). EPSG:29902 (Irish Grid) is a projected CRS in metres
# and would mislabel these coordinates.
gdf_infected_birds = gpd.GeoDataFrame(avian_flu, geometry='geometry', crs='EPSG:4326')

The dataset with the birds’ information contains only Latitude and Longitude, so I first convert them into geometry Points for use in a later polygon operation. After that, a loop runs over the Administrative Areas, and for each one an intersects operation checks which Points belong to that Polygon (Administrative Area).

# Adding the count of bird flu occurrences to each Administrative Area.
# The flag masks do not depend on the area, so they are built once up front.
is_flagged = gdf_infected_birds['target_H5_HPAI'] == 1
is_unflagged = gdf_infected_birds['target_H5_HPAI'] == 0

for index, area in admin_areas.iterrows():
    # Run the spatial predicate once per area and reuse it for both counts.
    in_area = gdf_infected_birds.intersects(area.geometry)

    count_infected_birds = int((is_flagged & in_area).sum())
    count_healthy_birds = int((is_unflagged & in_area).sum())
    total_birds = count_healthy_birds + count_infected_birds

    admin_areas.loc[index, 'TOTAL_BIRDS'] = total_birds
    admin_areas.loc[index, 'HEALTHY_BIRDS'] = count_healthy_birds
    admin_areas.loc[index, 'INFECTED_BIRDS'] = count_infected_birds
admin_areas.head()
ENGLISHGAEILGECONTAECOUNTYPROVINCEGUIDCENTROID_XCENTROID_YAREACC_IDOBJECTIDShape__AreaShape__LengthgeometryTOTAL_BIRDSHEALTHY_BIRDSINFECTED_BIRDS
0DUBLIN CITY COUNCILNoneBaile Átha CliathDUBLINLeinster2ae19629-1433-13a3-e055-000000000001716469.75735272.061.283502e+0826501111.283502e+08101493.212412POLYGON ((-6.38258 53.33367, -6.38261 53.33370...2161.01642.0519.0
1CORK CITY COUNCILNoneCorcaighCORKMunster2ae19629-1434-13a3-e055-000000000001565833.13571933.831.865976e+084551121.865976e+0880293.730785POLYGON ((-8.38436 51.90533, -8.38425 51.90529...223.0182.041.0
2GALWAY CITY COUNCILNoneGaillimhGALWAYConnacht2ae19629-1435-13a3-e055-000000000001530067.66726500.525.069505e+076501135.069505e+0764020.725628MULTIPOLYGON (((-9.13605 53.26682, -9.13606 53...989.0730.0259.0
3OFFALY COUNTY COUNCILNoneUíbh FhailíOFFALYLeinster2ae19629-1496-13a3-e055-000000000001631261.72709672.352.000025e+0918500142.000025e+09389927.708615POLYGON ((-7.97902 53.33689, -7.97878 53.33684...76.069.07.0
4WICKLOW COUNTY COUNCILNoneCill MhantáinWICKLOWLeinster2ae19629-149e-13a3-e055-000000000001707784.79690738.102.025161e+0925500152.025161e+09320629.958733MULTIPOLYGON (((-6.14602 52.78372, -6.14607 52...1231.01062.0169.0


# Save the administrative areas, including the bird-count columns added above,
# to a local file in GeoJSON format.
admin_areas.to_file(admin_areas_json, driver='GeoJSON')

« BACK