Micah P. Dombrowski / Mar 18 2020
COVID-19 Exploratory Data Analysis
(Almost) Everything You Want To Know About COVID-19.
These visualizations were made by Devakumar kp. Original notebook is here.
#hide
# Notebook setup: imports, shared colour palette, and display configuration.

# essential libraries
import json
import random
from urllib.request import urlopen

# storage and analysis
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import folium

# colour palette shared by every chart below
cnf = '#393e46'  # confirmed - dark grey
dth = '#ff2e63'  # death - red
rec = '#21bf73'  # recovered - green
act = '#fe9801'  # active case - orange

# let matplotlib plot pandas datetime values directly
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# hide warnings so they do not clutter the rendered notebook
import warnings
warnings.filterwarnings('ignore')

# HTML embedding (IPython.display is the public path; IPython.core.display
# is deprecated for these names)
from IPython.display import HTML, Javascript, display
Python
#hide
# Load the cleaned JHU time-series CSV; 'Date' is parsed into datetimes.
url = 'https://raw.githubusercontent.com/imdevskp/covid_19_jhu_data_web_scrap_and_cleaning/master/covid_19_clean_complete.csv'
full_table = pd.read_csv(
    url,
    parse_dates=['Date'],
)

# Preview the first rows.
full_table.head()
Python
#hide
# Names of the case-count columns.
cases = ['Confirmed', 'Deaths', 'Recovered', 'Active']

# Active cases = confirmed - deaths - recovered
full_table['Active'] = (
    full_table['Confirmed']
    - full_table['Deaths']
    - full_table['Recovered']
)

# Report 'Mainland China' simply as 'China'.
country = full_table['Country/Region']
full_table['Country/Region'] = country.replace('Mainland China', 'China')

# Fill missing values: blank province names, zero counts.
full_table['Province/State'] = full_table['Province/State'].fillna('')
full_table[cases] = full_table[cases].fillna(0)
Python
#hide
# Derived subsets of full_table used by the later cells.

# cases on the cruise ships
ship = full_table[full_table['Province/State'].str.contains('Grand Princess')
                  | full_table['Province/State'].str.contains('Diamond Princess cruise ship')]

# China and the rest of the world (ROW)
china = full_table[full_table['Country/Region'] == 'China']
row = full_table[full_table['Country/Region'] != 'China']

# rows for the most recent date only; reset_index() keeps the old index as a column
full_latest = full_table[full_table['Date'] == full_table['Date'].max()].reset_index()
china_latest = full_latest[full_latest['Country/Region'] == 'China']
row_latest = full_latest[full_latest['Country/Region'] != 'China']

# latest figures condensed to one row per country (or per Chinese province).
# Columns are selected with a list: tuple-style selection on a groupby
# (gb['A', 'B']) is no longer supported by pandas 2.x.
latest_cols = ['Confirmed', 'Deaths', 'Recovered', 'Active']
full_latest_grouped = full_latest.groupby('Country/Region')[latest_cols].sum().reset_index()
china_latest_grouped = china_latest.groupby('Province/State')[latest_cols].sum().reset_index()
row_latest_grouped = row_latest.groupby('Country/Region')[latest_cols].sum().reset_index()
Python
World-Wide Totals
#hide
# Maximum value of each case column per (country, province) over all dates.
# A list is used for column selection; tuple-style selection on a groupby
# is no longer supported by pandas 2.x.
temp = full_table.groupby(['Country/Region', 'Province/State'])[
    ['Confirmed', 'Deaths', 'Recovered', 'Active']
].max()
# temp.style.background_gradient(cmap='Reds')
Python
#hide_input
# World-wide totals on the most recent date.
# A list is used for column selection; tuple-style selection on a groupby
# is no longer supported by pandas 2.x.
temp = full_table.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
temp.style.background_gradient(cmap='Pastel1')
Python
Progression of Virus Over Time
#hide_input
# Embed the Flourish bar-chart race.
# Editor link: https://app.flourish.studio/visualisation/1571387/edit
HTML(
    '''<div class="flourish-embed flourish-bar-chart-race" data-src="visualisation/1571387"><script src="https://public.flourish.studio/resources/embed.js"></script></div>'''
)
Python
Cumulative Outcomes
#hide
# Daily world-wide totals of recovered, deaths and active cases, reshaped to
# long form (one row per date and case type) for a stacked area chart.
# A list is used for column selection; tuple-style selection on a groupby
# is no longer supported by pandas 2.x.
temp = full_table.groupby('Date')[['Recovered', 'Deaths', 'Active']].sum().reset_index()
temp = temp.melt(id_vars="Date", value_vars=['Recovered', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')

fig = px.area(temp, x="Date", y="Count", color='Case',
              title='Cases over time', color_discrete_sequence=[rec, dth, act])
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-2-1.png')
Python
Loading viewer…
Recovery and Mortality Rate
#hide
# Deaths and recoveries per 100 confirmed cases, world-wide, per day.
# Only the count columns are summed: summing the whole frame would also
# aggregate the text and coordinate columns.
temp = full_table.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

# adding two more columns; scale to "per 100" first and round afterwards so
# the values carry no floating-point artefacts
temp['No. of Deaths to 100 Confirmed Cases'] = (temp['Deaths'] / temp['Confirmed'] * 100).round(1)
temp['No. of Recovered to 100 Confirmed Cases'] = (temp['Recovered'] / temp['Confirmed'] * 100).round(1)
# temp['No. of Recovered to 1 Death Case'] = round(temp['Recovered']/temp['Deaths'], 3)

temp = temp.melt(id_vars='Date',
                 value_vars=['No. of Deaths to 100 Confirmed Cases',
                             'No. of Recovered to 100 Confirmed Cases'],
                 var_name='Ratio', value_name='Value')

fig = px.line(temp, x="Date", y="Value", color='Ratio', log_y=True,
              title='Recovery and Mortality Rate Over The Time',
              color_discrete_sequence=[dth, rec])
fig.update_layout(legend=dict(orientation="h", y=1, x=0,
                              xanchor="left", yanchor="top"),
                  margin=dict(t=80, l=0, r=0, b=0))
fig
Python
Loading viewer…
No. of Places To Which COVID-19 spread
#hide
# Number of distinct Chinese provinces/states with at least one confirmed
# case, per date.
affected_china = china[china['Confirmed'] != 0]
c_spread = (
    affected_china
    .groupby('Date')['Province/State']
    .nunique(dropna=False)
    .reset_index()
)

fig = px.line(
    c_spread,
    x='Date',
    y='Province/State',
    text='Province/State',
    title='Number of Provinces/States/Regions of China<br>to which COVID-19 spread over the time',
    color_discrete_sequence=[cnf, dth, rec],
)
fig.update_traces(textposition='top center')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-3-1.png')
Python
Loading viewer…
# Number of distinct countries/regions with at least one confirmed case,
# per date.
affected = full_table[full_table['Confirmed'] != 0]
spread = (
    affected
    .groupby('Date')['Country/Region']
    .nunique(dropna=False)
    .reset_index()
)

fig = px.line(
    spread,
    x='Date',
    y='Country/Region',
    text='Country/Region',
    title='Number of Countries/Regions<br>to which COVID-19 spread over the time',
    color_discrete_sequence=[cnf, dth, rec],
)
fig.update_traces(textposition='top center')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-3-2.png')
Python
Loading viewer…
Maps
#hide
# World map coloured by the latest confirmed count per country.
fig = px.choropleth(
    full_latest_grouped,
    locations="Country/Region",
    locationmode='country names',
    color="Confirmed",
    hover_name="Country/Region",
    range_color=[1, 7000],
    color_continuous_scale="aggrnyl",
    title='Countries with Confirmed Cases',
)
# Hide the colour bar and tighten the margins.
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-1-1.png')
Python
Loading viewer…
#hide
# World map of countries that have reported at least one death.
with_deaths = full_latest_grouped[full_latest_grouped['Deaths'] > 0]
fig = px.choropleth(
    with_deaths,
    locations="Country/Region",
    locationmode='country names',
    color="Deaths",
    hover_name="Country/Region",
    range_color=[1, 50],
    color_continuous_scale="agsunset",
    title='Countries with Deaths Reported',
)
# Hide the colour bar and tighten the margins.
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-1-2.png')
Python
Loading viewer…
Top 20 Countries
#hide
# Short alias for the latest per-country totals. This is the same object as
# full_latest_grouped, not a copy.
flg = full_latest_grouped

# Preview the first rows.
flg.head()
Python
#hide
# Horizontal bar chart of the 20 countries with the most confirmed cases.
# The second, ascending sort fixes the category order passed to the chart.
top_confirmed = flg.sort_values('Confirmed', ascending=False).head(20)
top_confirmed = top_confirmed.sort_values('Confirmed', ascending=True)

fig = px.bar(
    top_confirmed,
    x="Confirmed",
    y="Country/Region",
    title='Confirmed Cases',
    text='Confirmed',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, flg['Confirmed'].max() + 10000],
)
fig.update_traces(marker_color=cnf, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-4-1.png')
Python
Loading viewer…
#hide
# Horizontal bar chart of the 20 countries with the most deaths.
# The second, ascending sort fixes the category order passed to the chart.
top_deaths = flg.sort_values('Deaths', ascending=False).head(20)
top_deaths = top_deaths.sort_values('Deaths', ascending=True)

fig = px.bar(
    top_deaths,
    x="Deaths",
    y="Country/Region",
    title='Deaths',
    text='Deaths',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, flg['Deaths'].max() + 500],
)
fig.update_traces(marker_color=dth, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-4-2.png')
Python
Loading viewer…
#hide
# Horizontal bar chart of the 20 countries with the most recoveries.
# The second, ascending sort fixes the category order passed to the chart.
top_recovered = flg.sort_values('Recovered', ascending=False).head(20)
top_recovered = top_recovered.sort_values('Recovered', ascending=True)

fig = px.bar(
    top_recovered,
    x="Recovered",
    y="Country/Region",
    title='Recovered',
    text='Recovered',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, flg['Recovered'].max() + 10000],
)
fig.update_traces(marker_color=rec, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-4-3.png')
Python
Loading viewer…
#hide
# Horizontal bar chart of the 20 countries with the most active cases.
# The second, ascending sort fixes the category order passed to the chart.
top_active = flg.sort_values('Active', ascending=False).head(20)
top_active = top_active.sort_values('Active', ascending=True)

fig = px.bar(
    top_active,
    x="Active",
    y="Country/Region",
    title='Active',
    text='Active',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, flg['Active'].max() + 3000],
)
fig.update_traces(marker_color=act, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-4-4.png')
Python
Loading viewer…
#hide
# Deaths per 100 confirmed cases for the 15 worst-affected countries.
# (Only countries with more than 100 cases are considered)

# NOTE: flg is the per-country latest-totals frame, so this adds the
# 'Mortality Rate' column to that shared frame in place.
flg['Mortality Rate'] = round((flg['Deaths'] / flg['Confirmed']) * 100, 2)

temp = flg[flg['Confirmed'] > 100]
temp = temp.sort_values('Mortality Rate', ascending=False)

# temp is already sorted descending, so the top 15 are simply its head;
# the ascending re-sort fixes the category order passed to the chart.
top_rates = temp.head(15).sort_values('Mortality Rate', ascending=True)

# Keep the original 0-8 axis, but widen it when a rate would be clipped.
x_max = max(8, top_rates['Mortality Rate'].max() + 1)

fig = px.bar(top_rates,
             x="Mortality Rate", y="Country/Region", text='Mortality Rate', orientation='h',
             width=700, height=600, range_x=[0, x_max],
             title='No. of Deaths Per 100 Confirmed Case')
fig.update_traces(marker_color=act, opacity=0.6, textposition='outside')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-4-5.png')
Python
Loading viewer…
Composition of Cases
#hide_input
# Treemap of confirmed cases, nested country -> province/state.
by_confirmed = full_latest.sort_values(by='Confirmed', ascending=False)
by_confirmed = by_confirmed.reset_index(drop=True)

fig = px.treemap(
    by_confirmed,
    path=["Country/Region", "Province/State"],
    values="Confirmed",
    height=700,
    title='Number of Confirmed Cases',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# Show the label, text and value on each tile.
fig.data[0].textinfo = 'label+text+value'
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-8-1.png')
Python
Loading viewer…
# Treemap of reported deaths, nested country -> province/state.
by_deaths = full_latest.sort_values(by='Deaths', ascending=False)
by_deaths = by_deaths.reset_index(drop=True)

fig = px.treemap(
    by_deaths,
    path=["Country/Region", "Province/State"],
    values="Deaths",
    height=700,
    title='Number of Deaths reported',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# Show the label, text and value on each tile.
fig.data[0].textinfo = 'label+text+value'
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-8-2.png')
Python
Loading viewer…
Epidemic Span
Note: In the graph, the last day is shown as one day after the most recent date on which new confirmed cases were reported in the country/region.
#hide_input
# Gantt chart of each country's epidemic span: from the first date with a
# confirmed case to one day after the last date with new confirmed cases.
from datetime import timedelta

# first date
# ----------
# earliest date on which each country had any confirmed case
first_date = full_table[full_table['Confirmed'] > 0]
first_date = first_date.groupby('Country/Region')['Date'].agg(['min']).reset_index()
# first_date.head()

# last date
# ---------
# Daily totals per country, then the day-over-day change *within* each
# country (the first row of every country is NaN, so one country's numbers
# never leak into the next). A list is used for column selection;
# tuple-style selection on a groupby is no longer supported by pandas 2.x.
daily_totals = full_table.groupby(['Country/Region', 'Date'])[
    ['Confirmed', 'Deaths', 'Recovered']
].sum()
last_date = daily_totals.groupby(level='Country/Region').diff().reset_index()

# latest date on which each country reported new confirmed cases
last_date = last_date[last_date['Confirmed'] > 0]
last_date = last_date.groupby('Country/Region')['Date'].agg(['max']).reset_index()
# last_date.head()

# first_last
# ----------
# Join on the country name. Pairing the two frames by row position would
# shift every following row if a country were missing from last_date.
first_last = first_date.merge(last_date, on='Country/Region', how='inner')

# added 1 more day, which will show the next day as the day on which last case appeared
first_last['max'] = first_last['max'] + timedelta(days=1)

# no. of days
first_last['Days'] = first_last['max'] - first_last['min']

# task column as country
first_last['Task'] = first_last['Country/Region']

# rename columns to the names passed on to the Gantt chart
first_last.columns = ['Country/Region', 'Start', 'Finish', 'Days', 'Task']

# sort by no. of days
first_last = first_last.sort_values('Days')
# first_last.head()

# visualization
# --------------

# produce one random colour per country
clr = ["#" + ''.join(random.choice('0123456789ABC') for _ in range(6))
       for _ in range(len(first_last))]

# plot
fig = ff.create_gantt(first_last, index_col='Country/Region', colors=clr,
                      show_colorbar=False, bar_width=0.2, showgrid_x=True,
                      showgrid_y=True, height=1600, title=('Gantt Chart'))
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0),
                  autosize=False, width=700, height=5000)
fig  # .write_image('covid-eda-9-1.png')
Python
Loading viewer…
China vs. Not China
#hide
# In China: day-over-day change in confirmed, deaths and recovered.
# A list is used for column selection; tuple-style selection on a groupby
# is no longer supported by pandas 2.x.
temp = china.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().diff()
temp = temp.reset_index()
temp = temp.melt(id_vars="Date",
                 value_vars=['Confirmed', 'Deaths', 'Recovered'])

fig = px.bar(temp, x="Date", y="value", color='variable',
             title='In China',
             color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(barmode='group')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-10-1.png')
Python
Loading viewer…
# ROW (outside China): day-over-day change in confirmed, deaths and recovered.
# A list is used for column selection; tuple-style selection on a groupby
# is no longer supported by pandas 2.x.
temp = row.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().diff()
temp = temp.reset_index()
temp = temp.melt(id_vars="Date",
                 value_vars=['Confirmed', 'Deaths', 'Recovered'])

fig = px.bar(temp, x="Date", y="value", color='variable',
             title='Outside China',
             color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(barmode='group')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-10-2.png')
Python
Loading viewer…
#hide
def from_china_or_not(row):
    """Return 'From China' if the row's 'Country/Region' is 'China', else 'Outside China'."""
    if row['Country/Region'] == 'China':
        return 'From China'
    return 'Outside China'


# Daily new cases, China vs. everywhere else.
temp = full_table.copy()
temp['Region'] = temp.apply(from_china_or_not, axis=1)

# Daily totals per region, then the day-over-day change *within* each region
# (the first row of every region is NaN, so the two regions never mix).
# A list is used for column selection; tuple-style selection on a groupby
# is no longer supported by pandas 2.x.
region_totals = temp.groupby(['Region', 'Date'])[['Confirmed', 'Deaths', 'Recovered']].sum()
temp = region_totals.groupby(level='Region').diff().reset_index()

fig = px.bar(temp, x='Date', y='Confirmed', color='Region', barmode='group',
             text='Confirmed', title='Confirmed', color_discrete_sequence=[cnf, dth, rec])
fig.update_traces(textposition='outside')
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-10-3.png')
Python
Loading viewer…
# Daily new deaths, China vs. everywhere else (reuses the per-region `temp`
# frame built for the 'Confirmed' chart).
# The bar labels use the 'Deaths' column so they match the bar heights;
# labelling with 'Confirmed' showed the wrong numbers on this chart.
fig = px.bar(temp, x='Date', y='Deaths', color='Region', barmode='group',
             text='Deaths', title='Deaths', color_discrete_sequence=[cnf, dth, rec])
fig.update_traces(textposition='outside')
fig.update_traces(textangle=-90)
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-10-4.png')
Python
Loading viewer…
#hide
# Cumulative counts per date and country.
# The province rows are summed: taking the max would report only the single
# largest province instead of the country total. A list is used for column
# selection; tuple-style selection on a groupby is no longer supported by
# pandas 2.x.
gdf = full_table.groupby(['Date', 'Country/Region'])[['Confirmed', 'Deaths', 'Recovered']].sum()
gdf = gdf.reset_index()

# China only, in long form (one row per date and case type)
temp = gdf[gdf['Country/Region'] == 'China'].reset_index()
temp = temp.melt(id_vars='Date', value_vars=['Confirmed', 'Deaths', 'Recovered'],
                 var_name='Case', value_name='Count')

fig = px.bar(temp, x="Date", y="Count", color='Case', facet_col="Case",
             title='China', color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-10-5.png')
Python
Loading viewer…
# Cumulative counts for the rest of the world (everything except China).
# Totals are summed straight from the province-level rows, so countries
# reported by province are counted in full, and only the numeric count
# columns are aggregated.
outside_china = full_table[full_table['Country/Region'] != 'China']
temp = outside_china.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()
temp = temp.melt(id_vars='Date', value_vars=['Confirmed', 'Deaths', 'Recovered'],
                 var_name='Case', value_name='Count')

fig = px.bar(temp, x="Date", y="Count", color='Case', facet_col="Case",
             title='ROW', color_discrete_sequence=[cnf, dth, rec])
fig.update_layout(margin=dict(t=80, l=0, r=0, b=0))
fig  # .write_image('covid-eda-10-6.png')
Python
Loading viewer…
Data By Country
Top 50 Countries By Confirmed Cases
#hide_input
# Styled table of the 50 countries with the most confirmed cases.
temp_f = (
    full_latest_grouped
    .sort_values(by='Confirmed', ascending=False)
    .head(50)
    .reset_index(drop=True)
)
temp_f.style.background_gradient(cmap='Reds')
Python
Top 25 Countries By Deaths Reported
#hide_input
# Styled table of the 25 countries with the most reported deaths.
# Countries are ranked by deaths *before* the top 25 are taken; slicing a
# frame ordered by confirmed cases first would drop high-death countries
# that rank lower on confirmed cases.
temp_flg = full_latest_grouped[full_latest_grouped['Deaths'] > 0][['Country/Region', 'Deaths']]
temp_flg = temp_flg.sort_values('Deaths', ascending=False).head(25)
temp_flg.reset_index(drop=True).style.background_gradient(cmap='Reds')
Python
Top 25 Chinese Provinces By Confirmed Cases
#hide_input
# Styled table of Chinese provinces ordered by confirmed cases.
province_cols = ['Province/State', 'Confirmed', 'Deaths', 'Recovered']
temp_f = (
    china_latest_grouped[province_cols]
    .sort_values(by='Confirmed', ascending=False)
    .reset_index(drop=True)
)
temp_f.style.background_gradient(cmap='Pastel1_r')
Python
Related Work
- https://www.kaggle.com/imdevskp/mers-outbreak-analysis 
- https://www.kaggle.com/imdevskp/sars-2003-outbreak-analysis 
- https://www.kaggle.com/imdevskp/western-africa-ebola-outbreak-analysis