import geopandas as gpd
import json
from pathlib import Path
from shapely.geometry import Point
from shapely.geometry import Polygon

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib
from urllib import parse
from skimage import io

BirdWatch Ireland - Species & Habitat Conservation

BirdWatch Ireland is the largest independent conservation organisation in Ireland, and its objective is the protection of wild birds and their habitats. It has been doing incredible work protecting birds and biodiversity in Ireland. Check out their amazing work here.

The data is collected from BirdWatch Ireland’s website using web scraping. The site has a list of Ireland’s birds with detailed information on every species.

To create our dataset and merge with the data provided by the Department of Agriculture, Food and the Marine, the focus will be on the bird’s image and common name.

Note on scikit-image imread: OpenCV represents images in BGR order, whereas scikit-image represents them in RGB order. To use OpenCV functions after downloading an image, there is an extra step: converting the image from RGB to BGR.

# Listing markup: the bird entries sit under the parent <div class="birds-with-filters">.
# Pagination is path-based, e.g. the second page is addressed as "page/2/".
# Number of listing pages to scrape (hard-coded).
# NOTE(review): re-check this count against the live site before re-running.
no_pages = 24

def _text_or_nan(tag):
    """Return the tag's text, or NaN when the tag is missing or its text is empty."""
    if tag is None or tag.text == '':
        return np.nan
    return tag.text


def get_data(pageNo, base_url='', timeout=30):
    """Scrape one listing page and return one row per bird found on it.

    Args:
        pageNo: Value appended (via ``str()``) to ``base_url`` to form the
            request URL, e.g. a path suffix such as ``"page/2/"``.
        base_url: Prefix of the listing URL. Defaults to the empty string.
            NOTE(review): the base URL is blank in this source, so callers
            must supply it (or restore the literal) for the request to work.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of rows, each a 5-element list in the order
        ``[image, bird_name, irish_name, scientific_name, bird_family]``.
        ``image`` is the array returned by ``io.imread``; any field that is
        missing or empty on the page is ``np.nan`` so every row keeps the
        same width.
    """
    headers = {
        # Adjacent literals are concatenated; the original literal was broken
        # across two source lines, which is a syntax error.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) "
                      "Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    r = requests.get(base_url + str(pageNo), headers=headers, timeout=timeout)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.content, 'html.parser')

    alls = []
    for d in soup.find_all('article', attrs={'class': 'bird'}):
        # image
        image_div = d.find('div', attrs={'class': 'bird-img'})
        img_html = image_div.find('img') if image_div is not None else None

        # text
        bird_name = d.find('h3', attrs={'class': 'title'})

        complementary_info = d.find('div', attrs={'class': 'bird-info'})
        bird_info = complementary_info.find_all('p') if complementary_info is not None else []
        # Pad to three entries so a short or absent info block cannot raise IndexError.
        irish_name, scientific_name, bird_family = (list(bird_info) + [None] * 3)[:3]

        image = np.nan
        if img_html is not None:
            str_image = img_html.get('data-src')
            if str_image:
                # there is a URL on Page 20 with an accent
                # Snow Goose -é-Bruun.jpg
                # so the URL is percent-encoded, leaving ':' and '/' untouched;
                # otherwise reading the image throws an error
                image = io.imread(urllib.parse.quote(str_image, safe=':/'))

        # Always emit exactly one value per column so rows stay aligned.
        alls.append([
            image,
            _text_or_nan(bird_name),
            _text_or_nan(irish_name),
            _text_or_nan(scientific_name),
            _text_or_nan(bird_family),
        ])

    return alls
# Scrape every listing page; each get_data() call contributes that page's rows.
results = []
for i in range(1, no_pages+1):
    url_to_append = "page/{}/".format(i)
    # The original loop only printed the suffix and never fetched the page,
    # so `results` (and therefore `df_birds`) always stayed empty.
    results.append(get_data(url_to_append))
    print(url_to_append+': OK')


def flatten(l):
    """Collapse a list of per-page row lists into one flat list of rows."""
    return [item for sublist in l for item in sublist]


data_webscraping = flatten(results)
df_birds = pd.DataFrame(data_webscraping, columns=['Image','Bird_Name','Irish_Name','Scientific_Name','Bird_Family'])
# Optional export of the scraped table, left disabled.
# NOTE(review): a later step loads './data/BirdWatchIreland.pkl'; confirm which format is meant to be persisted.
#df_birds.to_csv('./data/BirdWatchIreland.csv', index=False)

H5N1 Wild Bird Species Identification

Department of Agriculture, Food and the Marine

Dataset provided by Ireland’s Department of Agriculture, Food and the Marine which contains the locations of bird species captured in Ireland from 1980-09-01 to 2020-01-27 and wild birds that are targeted for the H5N1 strain of avian flu.

# Load the Department of Agriculture, Food and the Marine bird records.
# The file has an .xls extension but is parsed here as delimited text with latin-1 encoding.
# NOTE(review): confirm the file really is CSV-formatted despite its extension.
wild_birds = pd.read_csv("./data/98696_58589762-e8f9-4bb0-9d39-09570efbad62.xls", encoding='latin-1')
# Load the BirdWatch Ireland table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
birdwatch = pd.read_pickle('./data/BirdWatchIreland.pkl')
Join the two bird datasets to link each image to its species, using Scientific Name as the key column.

# Species-indexed view of the BirdWatch Ireland table, used as the right-hand side of both joins.
birdwatch_by_species = birdwatch.set_index('Scientific_Name')
join_options = dict(on='Scientific_Name', lsuffix='_original', rsuffix='_bwi')

# Attach the BirdWatch Ireland columns to every record, matching on Scientific_Name.
bird_flu = wild_birds.join(birdwatch_by_species, **join_options)

# Selecting only infected birds (records flagged target_H5_HPAI == 1)
is_infected = bird_flu['target_H5_HPAI'] == 1
infected_birds = bird_flu.loc[is_infected]

# Number of infected records per species, largest first.
species_counts = infected_birds.groupby('Scientific_Name').size()
top_infected_species = species_counts.sort_values(ascending=False)

# Re-attach the BirdWatch Ireland columns to the per-species counts.
infected_birds_new = top_infected_species.to_frame().join(birdwatch_by_species, **join_options)
There are 3 species among those classified as Infected Birds which do not have an image. Something must differ between the two datasets. Let’s investigate.

Chroicocephalus ridibundus: both datasets have the same Common Name, Black-headed Gull, but in BirdWatch Ireland’s dataset the Scientific Name is Larus ridibundus. I will switch the Scientific Name so that the join works.

Branta bernicla: There are three subspecies of Brant (or Brent) Goose.

A fourth Brent Goose population has been recorded in Ireland, though its taxonomic status remains uncertain and it has no scientific name. It is generally known colloquially as ‘Grey-bellied Brant’.

Department of Agriculture, Food and the Marine dataset

Black-bellied Brant (nigricans) is very similar to Light-bellied Brent Goose (hrota), and care is needed to distinguish the two. Brent Goose (Dark-bellied) and Black Brant are rare winter visitors. As Brent Goose (Light-bellied) is the most common in Ireland and is easily mistaken for Black-bellied Brant (nigricans), I will combine Pale-bellied Brant and Black-bellied Brant.


Aythya marila: Greater Scaup from the Duck family is under the Scientific Name Anas marila on BirdWatch Ireland’s dataset.

# Work on copies so the tables loaded earlier stay untouched.
wild_birds_copy = wild_birds.copy()
birdwatch_copy = birdwatch.copy()

# Fix 1 - 'Chroicocephalus ridibundus': rename BirdWatch Ireland's 'Larus ridibundus' entry to match.
birdwatch_copy['Scientific_Name'] = birdwatch_copy['Scientific_Name'].replace(
    to_replace='Larus ridibundus',
    value='Chroicocephalus ridibundus',
)

# Inspect the distinct Brent Goose entries (one row per Common_Name),
# hiding the date and location columns. Evaluated for display only.
hidden_columns = ['Year', 'Month','Day','Time','Country','Country_State_County','State','Latitude','Longitude']
is_brent_goose = wild_birds_copy['Scientific_Name'].str.startswith('Branta bernicla')
(
    wild_birds_copy.loc[is_brent_goose]
    .drop_duplicates(subset='Common_Name')
    .drop(columns=hidden_columns)
)

# Record count per species over all records (the infected-only filter is not applied here).
species_sizes = wild_birds_copy.groupby('Scientific_Name').size()
top_infected_species = species_sizes.sort_values(ascending=False)

# Fix 2 - 'Branta bernicla': map the bare species name onto the 'hrota' subspecies.
wild_birds_copy['Scientific_Name'] = wild_birds_copy['Scientific_Name'].replace(
    to_replace=['Branta bernicla'],
    value='Branta bernicla hrota',
)

# Inspect the BirdWatch Ireland rows in the 'Ducks' family. Evaluated for display only.
birdwatch_copy.loc[birdwatch_copy['Bird_Family'].eq('Ducks')]

# Fix 3 - 'Anas marila': rename to 'Aythya marila' to match the department's dataset.
birdwatch_copy['Scientific_Name'] = birdwatch_copy['Scientific_Name'].replace(
    to_replace='Anas marila',
    value='Aythya marila',
)

# Join the corrected tables on Scientific_Name.
final_df = wild_birds_copy.join(
    birdwatch_copy.set_index('Scientific_Name'),
    on='Scientific_Name',
    lsuffix='_original',
    rsuffix='_bwi',
)


Ordnance Survey Ireland (OSi) - Ireland’s National Mapping Agency

The spatial data is provided by Ordnance Survey Ireland (OSi) under a Creative Commons licence.

Ordnance Survey Ireland has evolved from the Ordnance Survey Office which was established in 1824, later becoming a state body under the Ordnance Survey Ireland Act 2001. Under this Act, Ordnance Survey Ireland continued its mainstream public service function of creating and maintaining the definitive mapping records of the State and also assumed the commercial function assigned to it under the Act of developing its commercial business and sales revenues.

Administrative Areas dataset generated from the 2019 OSi National Statutory Boundary dataset.

Dataset License:

# Source location of the Administrative Areas GeoJSON.
# NOTE(review): the URL is blank here, so the read_file call below cannot succeed until it is filled in.
url_geoJSON = ''
# Local GeoJSON path used when the areas are saved later in the script.
admin_areas_json = 'data/Administrative_Areas_Ireland.json'
# Read the Administrative Areas into a GeoDataFrame.
admin_areas = gpd.read_file(url_geoJSON, driver='GeoJSON')

Adding a count of bird flu occurrences to each Administrative Area

# Work on a copy so the raw records in `wild_birds` stay unchanged.
avian_flu = wild_birds.copy()

# Build one point per record (x = Longitude, y = Latitude) in a single
# vectorised call. The previous iterrows()/.loc loop wrote one cell at a time,
# and assigned by index label, so duplicate labels would receive the wrong point.
avian_flu['geometry'] = gpd.points_from_xy(avian_flu['Longitude'], avian_flu['Latitude'])

Coordinate Reference System (CRS): Setting a projection with Spatial Reference EPSG Code

# Wrap the records in a GeoDataFrame that uses the 'geometry' column.
gdf_infected_birds = gpd.GeoDataFrame(avian_flu, geometry='geometry')
# Declare the coordinate reference system as EPSG:29902 (no reprojection is performed).
# NOTE(review): the points are built from Latitude/Longitude columns - confirm that
# EPSG:29902 is the intended CRS and that it matches the CRS of admin_areas.
gdf_infected_birds = gdf_infected_birds.set_crs(epsg=29902, inplace=True)

The dataset with the birds’ information has only Latitude and Longitude, so first I convert them into geometry Points for use in a polygon operation later. After that, a loop runs over the Administrative Areas and, for each one, an intersects operation checks which Points fall within that Polygon.

# Adding a count of bird flu occurrences to each Administrative Area.
# The flag masks do not depend on the area, so they are computed once up front.
infected_flag = gdf_infected_birds['target_H5_HPAI'] == 1
healthy_flag = gdf_infected_birds['target_H5_HPAI'] == 0

for index, area in admin_areas.iterrows():
    # One spatial test per area, shared by both counts (it used to be evaluated twice).
    in_area = gdf_infected_birds.intersects(area.geometry)
    count_infected_birds = int((infected_flag & in_area).sum())
    count_healthy_birds = int((healthy_flag & in_area).sum())
    # Records whose flag is neither 0 nor 1 are not part of the total.
    total_birds = count_healthy_birds + count_infected_birds
    admin_areas.loc[index, 'TOTAL_BIRDS'] = total_birds
    admin_areas.loc[index, 'HEALTHY_BIRDS'] = count_healthy_birds
    admin_areas.loc[index, 'INFECTED_BIRDS'] = count_infected_birds

# Saving the enriched Administrative Areas to a local GeoJSON file
admin_areas.to_file(admin_areas_json, driver='GeoJSON')