# Estimating The Mortality Rate For COVID-19

Adapted from analyses by Joseph Richards

Using Country-Level Covariates To Correct For Testing & Reporting Biases And Estimate a True Mortality Rate. ```python id=d8f227e1-1beb-4b5c-a22b-70727c1aec1d #hide # Setup and imports %matplotlib inline import warnings warnings.simplefilter('ignore') import matplotlib.pyplot as plt import numpy as np import pandas as pd import pymc3 as pm from IPython.display import display, Markdown ``` ```python id=3cbd5b05-e2af-4163-b5fe-4c9c1c1378fd #hide # constants ignore_countries = [ 'Others', 'Cruise Ship' ] cpi_country_mapping = { 'United States of America': 'US', 'China': 'Mainland China' } wb_country_mapping = { 'United States': 'US', 'Egypt, Arab Rep.': 'Egypt', 'Hong Kong SAR, China': 'Hong Kong', 'Iran, Islamic Rep.': 'Iran', 'China': 'Mainland China', 'Russian Federation': 'Russia', 'Slovak Republic': 'Slovakia', 'Korea, Rep.': 'Korea, South' } wb_covariates = [ ('SH.XPD.OOPC.CH.ZS', 'healthcare_oop_expenditure'), ('SH.MED.BEDS.ZS', 'hospital_beds'), ('HD.HCI.OVRL', 'hci'), ('SP.POP.65UP.TO.ZS', 'population_perc_over65'), ('SP.RUR.TOTL.ZS', 'population_perc_rural') ] ``` ```python id=5bd29190-0f16-4e83-a285-f3759b4dc744 #hide # data loading and manipulation from datetime import datetime import os import numpy as np import pandas as pd def get_all_data(): ''' Main routine that grabs all COVID and covariate data and returns them as a single dataframe that contains: * count of cumulative cases and deaths by country (by today's date) * days since first case for each country * CPI gov't transparency index * World Bank data on population, healthcare, etc. by country ''' all_covid_data = _get_latest_covid_timeseries() covid_cases_rollup = _rollup_by_country(all_covid_data['Confirmed']) covid_deaths_rollup = _rollup_by_country(all_covid_data['Deaths']) todays_date = covid_cases_rollup.columns.max() # Create DataFrame with today's cumulative case and death count, by country df_out = pd.DataFrame({'cases': covid_cases_rollup[todays_date], 'deaths': covid_deaths_rollup[todays_date]}) _clean_country_list(df_out) _clean_country_list(covid_cases_rollup) # Add observed death rate: df_out['death_rate_observed'] = df_out.apply( lambda row: row['deaths'] / float(row['cases']), axis=1) # Add covariate for days since first case df_out['days_since_first_case'] = _compute_days_since_first_case( covid_cases_rollup) # Add CPI covariate: _add_cpi_data(df_out) # Add World Bank covariates: _add_wb_data(df_out) # Drop any country w/o covariate data: num_null = df_out.isnull().sum(axis=1) to_drop_idx = df_out.index[num_null > 1] print('Dropping %i/%i countries due to lack of data' % (len(to_drop_idx), len(df_out))) df_out.drop(to_drop_idx, axis=0, inplace=True) return df_out, todays_date def _get_latest_covid_timeseries(): ''' Pull latest time-series data from JHU CSSE database ''' repo = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/' data_path = 'csse_covid_19_data/csse_covid_19_time_series/' all_data = {} for status in ['Confirmed', 'Deaths', 'Recovered']: file_name = 'time_series_19-covid-%s.csv' % status all_data[status] = pd.read_csv( '%s%s%s' % (repo, data_path, file_name)) return all_data def _rollup_by_country(df): ''' Roll up each raw time-series by country, adding up the cases across the individual states/provinces within the country :param df: Pandas DataFrame of raw data from CSSE :return: DataFrame of country counts ''' gb = df.groupby('Country/Region') df_rollup = gb.sum() df_rollup.drop(['Lat', 'Long'], axis=1, inplace=True, errors='ignore') # Drop dates with all 0 count data df_rollup.drop(df_rollup.columns[df_rollup.sum(axis=0) == 0], axis=1, inplace=True) # Convert column strings to dates: idx_as_dt = [datetime.strptime(x, '%m/%d/%y') for x in df_rollup.columns] df_rollup.columns = idx_as_dt return df_rollup def _clean_country_list(df): ''' Clean up input country list in df ''' # handle recent changes in country names: country_rename = { 'Hong Kong SAR': 'Hong Kong', 'Taiwan*': 'Taiwan', 'Czechia': 'Czech Republic', 'Brunei': 'Brunei Darussalam', 'Iran (Islamic Republic of)': 'Iran', 'Viet Nam': 'Vietnam', 'Russian Federation': 'Russia', 'Republic of Korea': 'South Korea', 'Republic of Moldova': 'Moldova', 'China': 'Mainland China' } df.rename(country_rename, axis=0, inplace=True) df.drop(ignore_countries, axis=0, inplace=True, errors='ignore') def _compute_days_since_first_case(df_cases): ''' Compute the country-wise days since first confirmed case :param df_cases: country-wise time-series of confirmed case counts :return: Series of country-wise days since first case ''' date_first_case = df_cases[df_cases > 0].idxmin(axis=1) days_since_first_case = date_first_case.apply( lambda x: (df_cases.columns.max() - x).days) # Add 1 month for China, since outbreak started late 2019: days_since_first_case.loc['Mainland China'] += 30 return days_since_first_case def _add_cpi_data(df_input): ''' Add the Government transparency (CPI - corruption perceptions index) data (by country) as a column in the COVID cases dataframe. :param df_input: COVID-19 data rolled up country-wise :return: None, add CPI data to df_input in place ''' cpi_data = pd.read_excel( 'https://github.com/jwrichar/COVID19-mortality/blob/master/data/CPI2019.xlsx?raw=true', skiprows=2) cpi_data.set_index('Country', inplace=True, drop=True) cpi_data.rename(cpi_country_mapping, axis=0, inplace=True) # Add CPI score to input df: df_input['cpi_score_2019'] = cpi_data['CPI score 2019'] def _add_wb_data(df_input): ''' Add the World Bank data covariates as columns in the COVID cases dataframe. :param df_input: COVID-19 data rolled up country-wise :return: None, add World Bank data to df_input in place ''' wb_data = pd.read_csv( 'https://raw.githubusercontent.com/jwrichar/COVID19-mortality/master/data/world_bank_data.csv', na_values='..') for (wb_name, var_name) in wb_covariates: wb_series = wb_data.loc[wb_data['Series Code'] == wb_name] wb_series.set_index('Country Name', inplace=True, drop=True) wb_series.rename(wb_country_mapping, axis=0, inplace=True) # Add WB data: df_input[var_name] = _get_most_recent_value(wb_series) def _get_most_recent_value(wb_series): ''' Get most recent non-null value for each country in the World Bank time-series data ''' ts_data = wb_series[wb_series.columns[3::]] def _helper(row): row_nn = row[row.notnull()] if len(row_nn): return row_nn[-1] else: return np.nan return ts_data.apply(_helper, axis=1) ``` ```python id=09f554d3-121e-4b98-a362-5b976ef950b0 #hide # Load the data (see source/data.py): df, todays_date = get_all_data() # Impute NA's column-wise: df = df.apply(lambda x: x.fillna(x.mean()),axis=0) ``` # Observed mortality rates ```python id=9ba556f2-1401-421d-a81c-1a623c2d55c9 #collapse-hide display(Markdown('Data as of %s' % todays_date)) reported_mortality_rate = df['deaths'].sum() / df['cases'].sum() display(Markdown('Overall reported mortality rate: %.2f%%' % (100.0 * reported_mortality_rate))) df_highest = df.sort_values('cases', ascending=False).head(15) mortality_rate = pd.Series( data=(df_highest['deaths']/df_highest['cases']).values, index=map(lambda x: '%s (%i cases)' % (x, df_highest.loc[x]['cases']), df_highest.index)) ax = mortality_rate.plot.bar( figsize=(14,7), title='Reported Mortality Rate by Country (countries w/ highest case counts)') ax.axhline(reported_mortality_rate, color='k', ls='--') plt.show() ``` # Model Estimate COVID-19 mortality rate, controling for country factors. ```python id=2d02033e-4bc3-4958-9d6e-027d92bc72cf #hide import numpy as np import pymc3 as pm def initialize_model(df): # Normalize input covariates in a way that is sensible: # (1) days since first case: upper # mu_0 to reflect asymptotic mortality rate months after outbreak _normalize_col(df, 'days_since_first_case', how='upper') # (2) CPI score: upper # mu_0 to reflect scenario in absence of corrupt govts _normalize_col(df, 'cpi_score_2019', how='upper') # (3) healthcare OOP spending: mean # not sure which way this will go _normalize_col(df, 'healthcare_oop_expenditure', how='mean') # (4) hospital beds: upper # more beds, more healthcare and tests _normalize_col(df, 'hospital_beds', how='mean') # (5) hci = human capital index: upper # HCI measures education/health; mu_0 should reflect best scenario _normalize_col(df, 'hci', how='mean') # (6) % over 65: mean # mu_0 to reflect average world demographic _normalize_col(df, 'population_perc_over65', how='mean') # (7) % rural: mean # mu_0 to reflect average world demographic _normalize_col(df, 'population_perc_rural', how='mean') n = len(df) covid_mortality_model = pm.Model() with covid_mortality_model: # Priors: mu_0 = pm.Beta('mu_0', alpha=0.3, beta=10) sig_0 = pm.Uniform('sig_0', lower=0.0, upper=mu_0 * (1 - mu_0)) beta = pm.Normal('beta', mu=0, sigma=5, shape=7) sigma = pm.HalfNormal('sigma', sigma=5) # Model mu from country-wise covariates: # Apply logit transformation so logistic regression performed mu_0_logit = np.log(mu_0 / (1 - mu_0)) mu_est = mu_0_logit + \ beta[0] * df['days_since_first_case_normalized'].values + \ beta[1] * df['cpi_score_2019_normalized'].values + \ beta[2] * df['healthcare_oop_expenditure_normalized'].values + \ beta[3] * df['hospital_beds_normalized'].values + \ beta[4] * df['hci_normalized'].values + \ beta[5] * df['population_perc_over65_normalized'].values + \ beta[6] * df['population_perc_rural_normalized'].values mu_model_logit = pm.Normal('mu_model_logit', mu=mu_est, sigma=sigma, shape=n) # Transform back to probability space: mu_model = np.exp(mu_model_logit) / (np.exp(mu_model_logit) + 1) # tau_i, mortality rate for each country # Parametrize with (mu, sigma) # instead of (alpha, beta) to ease interpretability. tau = pm.Beta('tau', mu=mu_model, sigma=sig_0, shape=n) # tau = pm.Beta('tau', mu=mu_0, sigma=sig_0, shape=n) # Binomial likelihood: d_obs = pm.Binomial('d_obs', n=df['cases'].values, p=tau, observed=df['deaths'].values) return covid_mortality_model def _normalize_col(df, colname, how='mean'): ''' Normalize an input column in one of 3 ways: * how=mean: unit normal N(0,1) * how=upper: normalize to [-1, 0] with highest value set to 0 * how=lower: normalize to [0, 1] with lowest value set to 0 Returns df modified in place with extra column added. ''' colname_new = '%s_normalized' % colname if how == 'mean': mu = df[colname].mean() sig = df[colname].std() df[colname_new] = (df[colname] - mu) / sig elif how == 'upper': maxval = df[colname].max() minval = df[colname].min() df[colname_new] = (df[colname] - maxval) / (maxval - minval) elif how == 'lower': maxval = df[colname].max() minval = df[colname].min() df[colname_new] = (df[colname] - minval) / (maxval - minval) ``` ```python id=2b9746ee-b665-4b8c-8cdb-6bd685b51b17 #hide # Initialize the model: mod = initialize_model(df) # Run MCMC sampler1 with mod: trace = pm.sample(300, tune=100, chains=3, cores=2) ``` ```python id=d1e36029-5ac2-4227-8cce-5c25e9339d2a #collapse-hide n_samp = len(trace['mu_0']) mu0_summary = pm.summary(trace).loc['mu_0'] print("COVID-19 Global Mortality Rate Estimation:") print("Posterior mean: %0.2f%%" % (100*trace['mu_0'].mean())) print("Posterior median: %0.2f%%" % (100*np.median(trace['mu_0']))) lower = np.sort(trace['mu_0'])[int(n_samp*0.025)] upper = np.sort(trace['mu_0'])[int(n_samp*0.975)] print("95%% posterior interval: (%0.2f%%, %0.2f%%)" % (100*lower, 100*upper)) prob_lt_reported = sum(trace['mu_0'] < reported_mortality_rate) / len(trace['mu_0']) print("Probability true rate less than reported rate (%.2f%%) = %.2f%%" % (100*reported_mortality_rate, 100*prob_lt_reported)) print("") # Posterior plot for mu0 print('Posterior probability density for COVID-19 mortality rate, controlling for country factors:') ax = pm.plot_posterior(trace, var_names=['mu_0'], figsize=(18, 8), textsize=18, credible_interval=0.95, bw=3.0, lw=3, kind='kde', ref_val=round(reported_mortality_rate, 3)) ``` ## Magnitude and Significance of Factors For bias in reported COVID-19 mortality rate ```python id=8490481a-e19d-4637-9548-b1666fa7bf8d #collapse-hide # Posterior summary for the beta parameters: beta_summary = pm.summary(trace).head(7) beta_summary.index = ['days_since_first_case', 'cpi', 'healthcare_oop', 'hospital_beds', 'hci', 'percent_over65', 'percent_rural'] beta_summary.reset_index(drop=False, inplace=True) err_vals = ((beta_summary['hpd_3%'] - beta_summary['mean']).values, (beta_summary['hpd_97%'] - beta_summary['mean']).values) ax = beta_summary.plot(x='index', y='mean', kind='bar', figsize=(14, 7), title='Posterior Distribution of Beta Parameters', yerr=err_vals, color='lightgrey', legend=False, grid=True, capsize=5) beta_summary.plot(x='index', y='mean', color='k', marker='o', linestyle='None', ax=ax, grid=True, legend=False, xlim=plt.gca().get_xlim()) plt #.savefig('../images/corvid-mortality.png') ``` # About This Analysis This analysis was done by [Joseph Richards](https://twitter.com/joeyrichar) In this project\[3\], we attempt to estimate the true mortality rate\[1\] for COVID-19 while controlling for country-level covariates\[2\]\[4\] such as: * age of outbreak in the country * transparency of the country's government * access to healthcare * demographics such as age of population and rural vs. urban Estimating a mortality rate lower than the overall reported rate likely implies that there has been **significant under-testing and under-reporting of cases globally**. ## Interpretation of Country-Level Parameters 1. days_since_first_case - positive (very statistically significant). As time since outbreak increases, expected mortality rate **increases**, as expected. 2. cpi - negative (statistically significant). As government transparency increases, expected mortality rate **decreases**. This may mean that less transparent governments under-report cases, hence inflating the mortality rate. 3. healthcare avg. out-of-pocket spending - no significant trend. 4. hospital beds per capita - no significant trend. 5. Human Capital Index - no significant trend (slightly negative = mortality rates decrease with increased mobilization of the country) 6. percent over 65 - positive (statistically significant). As population age increases, the mortality rate also **increases**, as expected. 7. percent rural - no significant trend. \[1\]: As of March 10, the **overall reported mortality rate is 3.5%**. However, this figure does not account for **systematic biases in case reporting and testing**. The observed mortality of COVID-19 has varied widely from country to country (as of early March 2020). For instance, as of March 10, mortality rates have ranged from < 0.1% in places like Germany (1100+ cases) to upwards of 5% in Italy (9000+ cases) and 3.9% in China (80k+ cases). \[2\]: The point of our modelling work here is to **try to understand and correct for the country-to-country differences that may cause the observed discrepancies in COVID-19 country-wide mortality rates**. That way we can "undo" those biases and try to **pin down an overall *real* mortality rate**. \[3\]: Full details about the model are available at: \[4\]: The affects of these parameters are subject to change as more data are collected. # Appendix: Model Diagnostics The following trace plots help to assess the convergence of the MCMC sampler. ```python id=b1ceae2c-6b02-473c-9733-3c86dfc94815 #hide_input import arviz as az az.plot_trace(trace, compact=True); ```
This notebook was exported from https://nextjournal.com/a/MKo5CK6fpCwWx1dekQW9k?change-id=CfRVduFvvVxEMkpKUkW9qk ```edn nextjournal-metadata {:article {:nodes {"09f554d3-121e-4b98-a362-5b976ef950b0" {:compute-ref #uuid "d01fc5e8-0c7d-4ac5-8a03-479f5fa93ab1", :exec-duration 4241, :id "09f554d3-121e-4b98-a362-5b976ef950b0", :kind "code", :output-log-lines {:stdout 2}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "2b9746ee-b665-4b8c-8cdb-6bd685b51b17" {:compute-ref #uuid "42a866ab-e222-4a93-a040-4f4d9d1207f9", :exec-duration 395076, :id "2b9746ee-b665-4b8c-8cdb-6bd685b51b17", :kind "code", :output-log-lines {:stdout 15}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "2d02033e-4bc3-4958-9d6e-027d92bc72cf" {:compute-ref #uuid "5dc6afc8-36b6-4180-9af0-043c68a4d921", :exec-duration 101, :id "2d02033e-4bc3-4958-9d6e-027d92bc72cf", :kind "code", :output-log-lines {}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "3cbd5b05-e2af-4163-b5fe-4c9c1c1378fd" {:compute-ref #uuid "4862320f-a83d-4b19-bfba-7a96c0e7301d", :exec-duration 42, :id "3cbd5b05-e2af-4163-b5fe-4c9c1c1378fd", :kind "code", :output-log-lines {}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4" {:environment [:environment {:article/nextjournal.id #uuid "02d62115-190f-427d-9c12-6d7432f79b6f", :change/nextjournal.id #uuid "5e6fb365-67ba-47dc-b0f1-4673105b7e4f", :node/id "b6b18383-c9c7-46a7-b794-c44c05bfdd74"}], :id "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4", :kind "runtime", :language "python", :type :jupyter}, "5bd29190-0f16-4e83-a285-f3759b4dc744" {:compute-ref #uuid "06018c24-a935-4df6-9d39-f4ab1e95c3f6", :exec-duration 127, :id "5bd29190-0f16-4e83-a285-f3759b4dc744", :kind "code", :output-log-lines {}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "8490481a-e19d-4637-9548-b1666fa7bf8d" {:compute-ref #uuid "fd9146b6-b6e6-4071-8d07-e7d3d7adba87", :exec-duration 3792, :id "8490481a-e19d-4637-9548-b1666fa7bf8d", :kind "code", :output-log-lines {}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "9ba556f2-1401-421d-a81c-1a623c2d55c9" {:compute-ref #uuid "741fb8a7-7044-4b6a-b5af-bf972b98413a", :exec-duration 1131, :id "9ba556f2-1401-421d-a81c-1a623c2d55c9", :kind "code", :output-log-lines {}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "b1ceae2c-6b02-473c-9733-3c86dfc94815" {:compute-ref #uuid "551efa2a-5728-422c-b3ae-8ab1e02afacd", :exec-duration 16424, :id "b1ceae2c-6b02-473c-9733-3c86dfc94815", :kind "code", :output-log-lines {}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "d1e36029-5ac2-4227-8cce-5c25e9339d2a" {:compute-ref #uuid "df962241-0909-497a-8cb6-ff13913f708e", :exec-duration 4680, :id "d1e36029-5ac2-4227-8cce-5c25e9339d2a", :kind "code", :output-log-lines {:stdout 8}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}, "d8f227e1-1beb-4b5c-a22b-70727c1aec1d" {:compute-ref #uuid "28ad6b9b-91e4-4437-baf5-b6be1f25e030", :exec-duration 8062, :id "d8f227e1-1beb-4b5c-a22b-70727c1aec1d", :kind "code", :output-log-lines {}, :runtime [:runtime "568c175c-cdb7-4e0a-b3a2-ffc5a7f44fe4"]}}, :nextjournal/id #uuid "02d67534-590c-4829-9438-a26fb789d477", :article/change {:nextjournal/id #uuid "5e71e221-00b4-4d12-a9dd-5e9917d62143"}}} ```