# In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 12 16:28:46 2020

@author: dave
"""
import pandas as pd

# Source: https://ourworldindata.org/coronavirus
# Only the columns listed here are loaded; 'date' is parsed into datetimes
# so that the most recent row per country can be selected later on.
covid_columns = [
    'iso_code', 'continent', 'location', 'date',
    'total_cases_per_million', 'total_deaths_per_million',
    'total_tests_per_thousand', 'stringency_index',
    'population', 'population_density',
]
covid_data = pd.read_csv('owid-covid-data.csv',
                         usecols=covid_columns,
                         parse_dates=['date'])

# Source: https://ourworldindata.org/urbanization#share-of-populations-living-in-urban-areas
# header=0 consumes the file's own header row; the names below replace it.
urbanization = pd.read_csv('share-of-population-urban.csv',
                           header=0,
                           names=['iso_code', 'Year', 'urbanization'])
# NOTE(review): presumably the file has a leading entity-name column that
# read_csv turns into the index (more columns than names); replacing the
# index with a plain 0..n-1 range discards it -- confirm against the CSV.
urbanization = urbanization.reset_index(drop=True)

# Drop rows without an iso_code or a population density, then the
# OWID_WRL and OWID_KOS entries and any blank iso_code.
covid_data = covid_data.dropna(subset=['iso_code', 'population_density'])
unusable_codes = ['OWID_WRL', 'OWID_KOS', '']
covid_data = covid_data[~covid_data['iso_code'].isin(unusable_codes)]

# covid_data holds one row per country per date.  country_list keeps only
# the row with the latest date for each location.
# idxmax() returns, per location, the index label of the row holding the
# maximum date; those labels are then used to select the rows via .loc.
latest_row_labels = covid_data.groupby('location')['date'].idxmax()
country_list = covid_data.loc[latest_row_labels.values]

#print('country_list: \n', country_list.iloc[1])



# Same selection for urbanization: keep, for every iso_code, the row with
# the largest Year.  The Year column is dropped once the row is chosen.
latest_year_labels = urbanization.groupby('iso_code')['Year'].idxmax()
urbanization_list = urbanization.loc[latest_year_labels.values].drop(columns=['Year'])

# Remove the OWID_WRL and OWID_CIS entries.
excluded_codes = ['OWID_WRL', 'OWID_CIS']
urbanization_list = urbanization_list[~urbanization_list['iso_code'].isin(excluded_codes)]

# Attach the urbanization figure to each country.  The inner join keeps
# only iso_codes that appear in both tables.
country_list = country_list.merge(urbanization_list,
                                  how='inner',
                                  on='iso_code')

# Keep only countries with a known, non-zero death rate: first remove the
# zeros, then the rows where the value is missing.
country_list = country_list[country_list['total_deaths_per_million'] != 0]
country_list = country_list.dropna(subset=['total_deaths_per_million'])


#print(country_list[['iso_code','total_deaths_per_million'] ])#['total_deaths_per_million'])  #.head(5))  #['iso_code']['total_deaths_per_million'])

# https://matplotlib.org/3.2.2/gallery/mplot3d/surface3d.html#sphx-glr-gallery-mplot3d-surface3d-py
# https://matplotlib.org/3.2.2/gallery/color/colorbar_basics.html#sphx-glr-gallery-color-colorbar-basics-py

import matplotlib.pyplot as plt
# Two stacked plots sharing one figure: ax1 (top) shows population density
# and ax2 (bottom) shows urbanization, both against deaths per million.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
# Extra vertical space so the top plot's x-label does not collide with
# the bottom plot.
fig.subplots_adjust(hspace=0.4)

ax1.set_xlabel('total_deaths_per_million')
ax1.set_ylabel('population density')

# Order the countries from highest to lowest death rate and renumber the
# index 0..n-1 to match the new order.
country_list.sort_values(by='total_deaths_per_million',
                         ascending=False,
                         inplace=True,
                         ignore_index=True)


# Highlight and label notable countries on both plots.  At this point
# country_list is sorted in descending order of total_deaths_per_million,
# so the first rows are the deadliest and the last rows the least deadly.

rows, columns = country_list.shape
max_index = rows - 1

# Default marker style for every country; the rows highlighted below are
# overridden.  These columns are read back when the points are drawn.
country_list['colors'] = ['royalblue'] * rows
country_list['alpha'] = [0.4] * rows

# Column positions, needed because rows are addressed positionally (.iloc).
colors_loc = country_list.columns.get_loc('colors')
alpha_loc = country_list.columns.get_loc('alpha')
# Look the name column up by label instead of assuming it is column 2.
location_loc = country_list.columns.get_loc('location')


def _highlight_and_label(i, density_offset, urban_offset):
    """Highlight row ``i`` of country_list and label it on ax1 and ax2.

    i: positional row index into country_list in its current sort order.
    density_offset: (dx, dy) from the data point to the label text on ax1.
    urban_offset: (dx, dy) from the data point to the label text on ax2.
    """
    country_list.iloc[i, colors_loc] = 'red'
    country_list.iloc[i, alpha_loc] = 1.0
    country = country_list.iloc[i, location_loc]
    deaths = float(country_list['total_deaths_per_million'].iloc[i])
    xy1 = (deaths, float(country_list['population_density'].iloc[i]))
    xy2 = (deaths, float(country_list['urbanization'].iloc[i]))
    # The label is passed positionally: its keyword was renamed from ``s``
    # to ``text`` in Matplotlib 3.3, so neither keyword works on every
    # version.
    ax1.annotate(country,
                 xy=xy1,
                 xytext=(xy1[0] + density_offset[0], xy1[1] + density_offset[1]),
                 arrowprops={'arrowstyle': '->'})
    ax2.annotate(country,
                 xy=xy2,
                 xytext=(xy2[0] + urban_offset[0], xy2[1] + urban_offset[1]),
                 arrowprops={'arrowstyle': '->'})


# The three deadliest countries (fewer if the table is shorter).
for i in range(min(3, rows)):
    _highlight_and_label(i, (-80, 5000), (-80, -30))

# The two least deadly countries, walking up from the last row.  The label
# offset on ax1 grows with each step so the two labels do not overlap.
displacement = 0.5
for i in range(max_index, max(max_index - 2, -1), -1):
    _highlight_and_label(i,
                         (200 - 50 * displacement, displacement * 4000),
                         (100, 10))
    displacement = displacement + 1

# The two most densely populated countries.  The table is re-sorted in
# place, so positional index i now refers to density rank.
country_list.sort_values(axis=0,
                         by='population_density',
                         inplace=True,
                         ascending=False)
for i in range(min(2, rows)):
    _highlight_and_label(i,
                         (50, -5000 + i * 8000),
                         (150, -10 - i * 15))


# Figure title and the axis labels of the lower (urbanization) plot.
ax2.set_ylabel(ylabel='urbanization population \n (% of total)')
ax2.set_xlabel(xlabel='total_deaths_per_million')
plt.suptitle('Urbanization vs. Population Density in Driving COVID Death Rate')


# Draw every country on both plots.  Points are drawn one at a time because
# each row carries its own colour and alpha in the 'colors' and 'alpha'
# columns.
# The loop covers all rows of country_list.  The previous
# range(max_index) stopped one row short, so the last row in the current
# sort order was never plotted.
for deaths, density, urban, color, alpha in zip(
        country_list['total_deaths_per_million'],
        country_list['population_density'],
        country_list['urbanization'],
        country_list['colors'],
        country_list['alpha']):
    ax1.scatter(deaths, density, marker='.', c=color, alpha=alpha)
    ax2.scatter(deaths, urban, marker='.', c=color, alpha=alpha)