Fitting COVID data
I propose here a simple method to fit experimental data common to epidemiological spreads, such as the present COVID-19 pandemic, using the inverse gaussian distribution. This follows the general incompregension of my answer to the question Is the COVID-19 pandemic curve a Gaussian curve? on StackOverflow. My initial point is to say that a Gaussian is not adapted as it handles a distribution on real numbers, while such a curve (the variable being number of days) handles numbers on the half line. Inspired by the excellent A Theory of Reaction Time Distributions by Dr Fermin Moscoso del Prado Martin a constructive approach is to propose another distribution, such as the inverse Gaussian distribution.
This notebook develops this same idea on real data and proves numerically how bad the Gaussian fit is compared to the latter. Thinking about the importance of doing a proper inference in such a case, I conclude
In this notebook, I define a fitting method using pytorch which fits data to a the inverse gaussian distribution :
To learn more about some properties / implementation of this pdf, see scipy.stats documentation.
from __future__ import division, print_function
import numpy as np
np.set_printoptions(precision=6, suppress=True)
import os
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
phi = (np.sqrt(5)+1)/2
fig_width = 10
figsize = (fig_width, fig_width/phi)
getting real COVID data¶
retrieve data¶
We will here use data provided openly by the french government: https://github.com/opencovid19-fr/data and use the following URL:
URL, data_cache = 'https://raw.githubusercontent.com/opencovid19-fr/data/master/dist/chiffres-cles.json', '/tmp/covid_fr.json'
import pandas as pd
Let's cache the data locally to avoid downloading it again and again:
try:
df = pd.read_json(data_cache)
print('loading cache')
except:
df = pd.read_json(URL)
print('loading from internet')
df.to_json(data_cache)
df
Let's convert the date column into a proper date:
df['date'] = pd.to_datetime(df['date'])
df
... and extract the different data codes:
df['code'].unique()
filtering data¶
Selecting one region:
code = 'FRA'
code = 'REG-93'
df_loc = df[df['code']==code]
Selecting the two columns of interest:
df_loc = df_loc[['date', 'deces']]
df_loc
Removing that containing NaNs:
df_loc = df_loc[df_loc['deces'].notna()]
Selecting a date range:
start_date = '2020-02-23'
df_loc['date'] > start_date
df_loc = df_loc[start_date < df_loc['date']]
stop_date = '2020-06-23'
df_loc = df_loc[df_loc['date']<stop_date]
fig, ax = plt.subplots(figsize=figsize)
df_loc.plot(x='date', y='deces', ax=ax)
ax.set_title('Cumulative number of deaths');
returning a numpy array¶
death = np.array(df_loc['deces'])
df_loc['date'], death
death.shape, np.diff(death).shape
death[1:] = np.diff(death)
death[0] = 0
death
df_loc['death'] = death
df_loc
fig, ax = plt.subplots(figsize=figsize)
df_loc.plot(x='date', y='death', ax=ax)
ax.set_title('Daily rate of deaths');
performing the fit¶
converting data to fit¶
X, y = df_loc['date'], df_loc['death']
X.shape, y.shape
X = X.astype(int)
X = np.array(X)
X
INTS_PER_DAY = 86400000000000
X = (X-X[0])//INTS_PER_DAY
np.array(X)
X
y = np.array(y)
y
using torch¶
or "when you have a new hammer, everything looks like a nail, seehttps://en.wikipedia.org/wiki/Law_of_the_instrument
Let's use the definition of the pdf:
$$ f ( x ; \mu , \lambda ) = \sqrt {\frac {\lambda }{2\pi x^3}} \exp (-\frac {\lambda (x-\mu )^{2}}{2\mu ^{2}x} ) $$and a previous post where I used pyTorch to fit psychophysical data:
import torch
from torch.utils.data import TensorDataset, DataLoader
torch.set_default_tensor_type("torch.DoubleTensor")
criterion = torch.nn.MSELoss(reduction="sum")
# https://pytorch.org/docs/stable/generated/torch.nn.L1Loss.html#torch.nn.L1Loss
#criterion = torch.nn.L1Loss(reduction="sum")
class CovidRegressionModel(torch.nn.Module):
def __init__(self, mu=80, tau=24,
log_amp=torch.log(500*torch.ones(1)),
log_lambda=torch.log(100*torch.ones(1)),
):
super(CovidRegressionModel, self).__init__()
self.tau = torch.nn.Parameter(tau * torch.ones(1))
self.mu = torch.nn.Parameter(mu * torch.ones(1))
# when modeling a stricly positive number, a good habit is to use their log as the fitted parameter:
self.log_amp = torch.nn.Parameter(log_amp * torch.ones(1))
self.log_lambda = torch.nn.Parameter(log_lambda * torch.ones(1))
def forward(self, x):
out = (x > self.tau) * torch.exp(self.log_amp)
date = (x-self.tau) #
date[x<=self.tau] = 1 # to avoid NaNs in the output value (and in the gradient)
out *= torch.sqrt(torch.exp(self.log_lambda) / (2 * np.pi * date ** 3))
out *= torch.exp(-torch.exp(self.log_lambda) * (date - self.mu) ** 2 / (2 * self.mu ** 2 * date))
out[x<=self.tau] = 0 # overwriting values before the onset
return out
learning_rate = 0.005
beta1, beta2 = 0.9, 0.999
betas = (beta1, beta2)
num_epochs = 2 ** 9 + 1
batch_size = 16
amsgrad = False # gives similar results
amsgrad = True # gives similar results
def fit_data(
X,
y,
learning_rate=learning_rate,
batch_size=batch_size, # gamma=gamma,
num_epochs=num_epochs,
betas=betas,
verbose=False, **kwargs
):
variables, labels = torch.Tensor(X[:, None]), torch.Tensor(y[:, None])
loader = DataLoader(
TensorDataset(variables, labels), batch_size=batch_size, shuffle=True
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
covid_model = CovidRegressionModel()
covid_model = covid_model.to(device)
covid_model.train()
optimizer = torch.optim.Adam(
covid_model.parameters(), lr=learning_rate, betas=betas, amsgrad=amsgrad
)
for epoch in range(int(num_epochs)):
covid_model.train()
losses = []
for variables_, labels_ in loader:
variables_, labels_ = variables_.to(device), labels_.to(device)
outputs = covid_model(variables_)
loss = criterion(outputs, labels_)
optimizer.zero_grad()
loss.backward()
optimizer.step()
losses.append(loss.item())
if verbose and (epoch % (num_epochs // 32) == 0):
print(f"Iteration: {epoch} - Loss: {np.sum(losses)/len(variables):.5f}")
covid_model.eval()
variables, labels = torch.Tensor(X[:, None]), torch.Tensor(y[:, None])
outputs = covid_model(variables)
loss = criterion(outputs, labels).item() / len(variables)
return covid_model, loss
covid_model, loss = fit_data(X, y, verbose=True)
print("Final loss =", loss)
covid_model.eval()
outputs = covid_model(torch.Tensor(X[:, None]))
y_pred = outputs.detach().numpy()
fig, ax = plt.subplots(figsize=figsize)
ax.plot(X, y, '*', label='data')
ax.plot(X, y_pred, '--', label='fit')
ax.vlines(covid_model.mu.item(), y.min(), y.max(), colors='g', linestyles='--', label=f'$\mu$={covid_model.mu.item():.1f} (mean date)')
ax.vlines(covid_model.tau.item(), y.min(), y.max(), colors='r', linestyles='--', label=fr'$\tau$={covid_model.tau.item():.1f} (day of onset)')
ax.plot([], [], label=f'A={torch.exp(covid_model.log_amp).item():.1f} (amplitude)')
ax.plot([], [], label=f'$\lambda$={torch.exp(covid_model.log_lambda).item():.1f} (spread in days)')
ax.legend()
ax.set_xlabel(f'# number of days since {start_date}');
ax.set_ylabel(f'# number of dead per day');
ax.set_title('Fitting Daily rate of deaths (PACA-FR)');
some book keeping for the notebook¶
%load_ext watermark
%watermark -i -h -m -v -p numpy,pandas,matplotlib,torch -r -g -b