COVID-19 Exploratory Data Analysis

Adapted from analyses by Devakumar kp

(Almost) Everything You Want To Know About COVID-19.

These visualizations were made by Devakumar kp. Original notebook is here.

#hide
# essential libraries
import json
import random
from urllib.request import urlopen
# storing and anaysis
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import folium
# color pallette
cnf = '#393e46' # confirmed - grey
dth = '#ff2e63' # death - red
rec = '#21bf73' # recovered - cyan
act = '#fe9801' # active case - yellow
# converter
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()   
# hide warnings
import warnings
warnings.filterwarnings('ignore')
# html embedding
from IPython.display import Javascript
from IPython.core.display import display, HTML
#hide
# importing datasets
url = 'https://raw.githubusercontent.com/imdevskp/covid_19_jhu_data_web_scrap_and_cleaning/master/covid_19_clean_complete.csv'
full_table = pd.read_csv(url, 
                         parse_dates=['Date'])
full_table.head()
#hide
# cases 
cases = ['Confirmed', 'Deaths', 'Recovered', 'Active']
# Active Case = confirmed - deaths - recovered
full_table['Active'] = full_table['Confirmed'] - full_table['Deaths'] - full_table['Recovered']
# replacing Mainland china with just China
full_table['Country/Region'] = full_table['Country/Region'].replace('Mainland China', 'China')
# filling missing values 
full_table[['Province/State']] = full_table[['Province/State']].fillna('')
full_table[cases] = full_table[cases].fillna(0)
#hide
# cases in the ships
ship = full_table[full_table['Province/State'].str.contains('Grand Princess')|full_table['Province/State'].str.contains('Diamond Princess cruise ship')]
# china and the row
china = full_table[full_table['Country/Region']=='China']
row = full_table[full_table['Country/Region']!='China']
# latest
full_latest = full_table[full_table['Date'] == max(full_table['Date'])].reset_index()
china_latest = full_latest[full_latest['Country/Region']=='China']
row_latest = full_latest[full_latest['Country/Region']!='China']
# latest condensed
full_latest_grouped = full_latest.groupby('Country/Region')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
china_latest_grouped = china_latest.groupby('Province/State')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
row_latest_grouped = row_latest.groupby('Country/Region')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()

World-Wide Totals

#hide
temp = full_table.groupby(['Country/Region', 'Province/State'])['Confirmed', 'Deaths', 'Recovered', 'Active'].max()
# temp.style.background_gradient(cmap='Reds')
#hide_input
temp = full_table.groupby('Date')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
temp = temp[temp['Date']==max(temp['Date'])].reset_index(drop=True)
temp.style.background_gradient(cmap='Pastel1')

Progression of Virus Over Time

#hide_input
# https://app.flourish.studio/visualisation/1571387/edit
HTML('''<div class="flourish-embed flourish-bar-chart-race" data-src="visualisation/1571387"><script src="https://public.flourish.studio/resources/embed.js"></script></div>''')

Cumalitive Outcomes

#hide
temp = full_table.groupby('Date')['Recovered', 'Deaths', 'Active'].sum().reset_index()
temp = temp.melt(id_vars="Date", value_vars=['Recovered', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()
fig = px.area(temp, x="Date", y="Count", color='Case',
             title='Cases over time', color_discrete_sequence = [rec, dth, act])
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-2-1.png')
Loading viewer…

Recovery and Mortality Rate

#hide
temp = full_table.groupby('Date').sum().reset_index()
# adding two more columns
temp['No. of Deaths to 100 Confirmed Cases'] = round(temp['Deaths']/temp['Confirmed'], 3)*100
temp['No. of Recovered to 100 Confirmed Cases'] = round(temp['Recovered']/temp['Confirmed'], 3)*100
# temp['No. of Recovered to 1 Death Case'] = round(temp['Recovered']/temp['Deaths'], 3)
temp = temp.melt(id_vars='Date', value_vars=['No. of Deaths to 100 Confirmed Cases', 'No. of Recovered to 100 Confirmed Cases'], 
                 var_name='Ratio', value_name='Value')
fig = px.line(temp, x="Date", y="Value", color='Ratio', log_y=True, 
              title='Recovery and Mortality Rate Over The Time',
              color_discrete_sequence=[dth, rec])
fig.update_layout(legend=dict(orientation="h", y=1, x=0, 
                              xanchor="left", yanchor="top"),
                  margin=dict(t=80,l=0,r=0,b=0))
fig
Loading viewer…

No. of Places To Which COVID-19 spread

#hide
c_spread = china[china['Confirmed']!=0].groupby('Date')['Province/State'].unique().apply(len)
c_spread = pd.DataFrame(c_spread).reset_index()
fig = px.line(c_spread, x='Date', y='Province/State', text='Province/State',
              title='Number of Provinces/States/Regions of China<br>to which COVID-19 spread over the time',
             color_discrete_sequence=[cnf,dth, rec])
fig.update_traces(textposition='top center')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-3-1.png')
Loading viewer…
spread = full_table[full_table['Confirmed']!=0].groupby('Date')['Country/Region'].unique().apply(len)
spread = pd.DataFrame(spread).reset_index()
fig = px.line(spread, x='Date', y='Country/Region', text='Country/Region',
              title='Number of Countries/Regions<br>to which COVID-19 spread over the time',
             color_discrete_sequence=[cnf,dth, rec])
fig.update_traces(textposition='top center')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-3-2.png')
Loading viewer…

Maps

#hide
# Confirmed
fig = px.choropleth(full_latest_grouped, locations="Country/Region", 
                    locationmode='country names', color="Confirmed", 
                    hover_name="Country/Region", range_color=[1,7000], 
                    color_continuous_scale="aggrnyl", 
                    title='Countries with Confirmed Cases')
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-1-1.png')
Loading viewer…
#hide
# Deaths
fig = px.choropleth(full_latest_grouped[full_latest_grouped['Deaths']>0], 
                    locations="Country/Region", locationmode='country names',
                    color="Deaths", hover_name="Country/Region", 
                    range_color=[1,50], color_continuous_scale="agsunset",
                    title='Countries with Deaths Reported')
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-1-2.png')
Loading viewer…

Top 20 Countries

#hide
flg = full_latest_grouped
flg.head()
#hide
fig = px.bar(flg.sort_values('Confirmed', ascending=False).head(20).sort_values('Confirmed', ascending=True), 
             x="Confirmed", y="Country/Region", title='Confirmed Cases', text='Confirmed', orientation='h', 
             width=700, height=700, range_x = [0, max(flg['Confirmed'])+10000])
fig.update_traces(marker_color=cnf, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-4-1.png')
Loading viewer…
#hide
fig = px.bar(flg.sort_values('Deaths', ascending=False).head(20).sort_values('Deaths', ascending=True), 
             x="Deaths", y="Country/Region", title='Deaths', text='Deaths', orientation='h', 
             width=700, height=700, range_x = [0, max(flg['Deaths'])+500])
fig.update_traces(marker_color=dth, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-4-2.png')
Loading viewer…
#hide
fig = px.bar(flg.sort_values('Recovered', ascending=False).head(20).sort_values('Recovered', ascending=True), 
             x="Recovered", y="Country/Region", title='Recovered', text='Recovered', orientation='h', 
             width=700, height=700, range_x = [0, max(flg['Recovered'])+10000])
fig.update_traces(marker_color=rec, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-4-3.png')
Loading viewer…
#hide
fig = px.bar(flg.sort_values('Active', ascending=False).head(20).sort_values('Active', ascending=True), 
             x="Active", y="Country/Region", title='Active', text='Active', orientation='h', 
             width=700, height=700, range_x = [0, max(flg['Active'])+3000])
fig.update_traces(marker_color=act, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-4-4.png')
Loading viewer…
#hide
# (Only countries with more than 100 case are considered)
flg['Mortality Rate'] = round((flg['Deaths']/flg['Confirmed'])*100, 2)
temp = flg[flg['Confirmed']>100]
temp = temp.sort_values('Mortality Rate', ascending=False)
fig = px.bar(temp.sort_values('Mortality Rate', ascending=False).head(15).sort_values('Mortality Rate', ascending=True), 
             x="Mortality Rate", y="Country/Region", text='Mortality Rate', orientation='h', 
             width=700, height=600, range_x = [0, 8], title='No. of Deaths Per 100 Confirmed Case')
fig.update_traces(marker_color=act, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-4-5.png')
Loading viewer…

Composition of Cases

#hide_input
fig = px.treemap(full_latest.sort_values(by='Confirmed', ascending=False).reset_index(drop=True), 
                 path=["Country/Region", "Province/State"], values="Confirmed", height=700,
                 title='Number of Confirmed Cases',
                 color_discrete_sequence = px.colors.qualitative.Prism)
fig.data[0].textinfo = 'label+text+value'
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-8-1.png')
Loading viewer…
fig = px.treemap(full_latest.sort_values(by='Deaths', ascending=False).reset_index(drop=True), 
                 path=["Country/Region", "Province/State"], values="Deaths", height=700,
                 title='Number of Deaths reported',
                 color_discrete_sequence = px.colors.qualitative.Prism)
fig.data[0].textinfo = 'label+text+value'
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-8-2.png')
Loading viewer…

Epidemic Span

Note : In the graph, last day is shown as one day after the last time a new confirmed cases reported in the Country / Region

#hide_input
# first date
# ----------
first_date = full_table[full_table['Confirmed']>0]
first_date = first_date.groupby('Country/Region')['Date'].agg(['min']).reset_index()
# first_date.head()
from datetime import timedelta  
# last date
# ---------
last_date = full_table.groupby(['Country/Region', 'Date', ])['Confirmed', 'Deaths', 'Recovered']
last_date = last_date.sum().diff().reset_index()
mask = last_date['Country/Region'] != last_date['Country/Region'].shift(1)
last_date.loc[mask, 'Confirmed'] = np.nan
last_date.loc[mask, 'Deaths'] = np.nan
last_date.loc[mask, 'Recovered'] = np.nan
last_date = last_date[last_date['Confirmed']>0]
last_date = last_date.groupby('Country/Region')['Date'].agg(['max']).reset_index()
# last_date.head()
# first_last
# ----------
first_last = pd.concat([first_date, last_date[['max']]], axis=1)
# added 1 more day, which will show the next day as the day on which last case appeared
first_last['max'] = first_last['max'] + timedelta(days=1)
# no. of days
first_last['Days'] = first_last['max'] - first_last['min']
# task column as country
first_last['Task'] = first_last['Country/Region']
# rename columns
first_last.columns = ['Country/Region', 'Start', 'Finish', 'Days', 'Task']
# sort by no. of days
first_last = first_last.sort_values('Days')
# first_last.head()
# visualization
# --------------
# produce random colors
clr = ["#"+''.join([random.choice('0123456789ABC') for j in range(6)]) for i in range(len(first_last))]
#plot
fig = ff.create_gantt(first_last, index_col='Country/Region', colors=clr, 
                      show_colorbar=False, bar_width=0.2, showgrid_x=True, 
                      showgrid_y=True, height=1600, title=('Gantt Chart'))
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0),
                  autosize=False,width=700,height=5000)
fig #.write_image('covid-eda-9-1.png')
Loading viewer…

China vs. Not China

#hide
# In China
temp = china.groupby('Date')['Confirmed', 'Deaths', 'Recovered'].sum().diff()
temp = temp.reset_index()
temp = temp.melt(id_vars="Date", 
                 value_vars=['Confirmed', 'Deaths', 'Recovered'])
fig = px.bar(temp, x="Date", y="value", color='variable', 
             title='In China',
             color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(barmode='group')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-1.png')
Loading viewer…
# ROW
temp = row.groupby('Date')['Confirmed', 'Deaths', 'Recovered'].sum().diff()
temp = temp.reset_index()
temp = temp.melt(id_vars="Date", 
                 value_vars=['Confirmed', 'Deaths', 'Recovered'])
fig = px.bar(temp, x="Date", y="value", color='variable', 
             title='Outside China',
             color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(barmode='group')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-2.png')
Loading viewer…
#hide
def from_china_or_not(row):
    if row['Country/Region']=='China':
        return 'From China'
    else:
        return 'Outside China'
    
temp = full_table.copy()
temp['Region'] = temp.apply(from_china_or_not, axis=1)
temp = temp.groupby(['Region', 'Date'])['Confirmed', 'Deaths', 'Recovered']
temp = temp.sum().diff().reset_index()
mask = temp['Region'] != temp['Region'].shift(1)
temp.loc[mask, 'Confirmed'] = np.nan
temp.loc[mask, 'Deaths'] = np.nan
temp.loc[mask, 'Recovered'] = np.nan
fig = px.bar(temp, x='Date', y='Confirmed', color='Region', barmode='group', 
             text='Confirmed', title='Confirmed', color_discrete_sequence= [cnf, dth, rec])
fig.update_traces(textposition='outside')
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-3.png')
Loading viewer…
fig = px.bar(temp, x='Date', y='Deaths', color='Region', barmode='group', 
             text='Confirmed', title='Deaths', color_discrete_sequence= [cnf, dth, rec])
fig.update_traces(textposition='outside')
fig.update_traces(textangle=-90)
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-4.png')
Loading viewer…
#hide
gdf = full_table.groupby(['Date', 'Country/Region'])['Confirmed', 'Deaths', 'Recovered'].max()
gdf = gdf.reset_index()
temp = gdf[gdf['Country/Region']=='China'].reset_index()
temp = temp.melt(id_vars='Date', value_vars=['Confirmed', 'Deaths', 'Recovered'],
                var_name='Case', value_name='Count')
fig = px.bar(temp, x="Date", y="Count", color='Case', facet_col="Case",
            title='China', color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-5.png')
Loading viewer…
temp = gdf[gdf['Country/Region']!='China'].groupby('Date').sum().reset_index()
temp = temp.melt(id_vars='Date', value_vars=['Confirmed', 'Deaths', 'Recovered'],
                var_name='Case', value_name='Count')
fig = px.bar(temp, x="Date", y="Count", color='Case', facet_col="Case",
             title='ROW', color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(margin=dict(t=80,l=0,r=0,b=0))
fig #.write_image('covid-eda-10-6.png')
Loading viewer…

Data By Country

Top 50 Countries By Confirmed Cases

#hide_input
temp_f = full_latest_grouped.sort_values(by='Confirmed', ascending=False).head(50)
temp_f = temp_f.reset_index(drop=True)
temp_f.style.background_gradient(cmap='Reds')

Top 25 Countries By Deaths Reported

#hide_input
temp_flg = temp_f[temp_f['Deaths']>0][['Country/Region', 'Deaths']].head(25)
temp_flg.sort_values('Deaths', ascending=False).reset_index(drop=True).style.background_gradient(cmap='Reds')

Top 25 Chinese Provinces By Confirmed Cases

#hide_input
temp_f = china_latest_grouped[['Province/State', 'Confirmed', 'Deaths', 'Recovered']]
temp_f = temp_f.sort_values(by='Confirmed', ascending=False)
temp_f = temp_f.reset_index(drop=True)
temp_f.style.background_gradient(cmap='Pastel1_r')

Related Work

  1. https://www.kaggle.com/imdevskp/mers-outbreak-analysis

  2. https://www.kaggle.com/imdevskp/sars-2003-outbreak-analysis

  3. https://www.kaggle.com/imdevskp/western-africa-ebola-outbreak-analysis

Runtimes (1)