import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import itertools
from pylab import rcParams
warnings.filterwarnings('ignore')
rcParams['figure.figsize'] = 18, 15
sns.set_style('darkgrid')
''' reading dataset '''
df_city_day = pd.read_csv('city_day.csv')
''' displaying 5 rows '''
df_city_day.head()
City | Date | PM2.5 | PM10 | NO | NO2 | NOx | NH3 | CO | SO2 | O3 | Benzene | Toluene | Xylene | AQI | AQI_Bucket | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ahmedabad | 2015-01-01 | NaN | NaN | 0.92 | 18.22 | 17.15 | NaN | 0.92 | 27.64 | 133.36 | 0.00 | 0.02 | 0.00 | NaN | NaN |
1 | Ahmedabad | 2015-01-02 | NaN | NaN | 0.97 | 15.69 | 16.46 | NaN | 0.97 | 24.55 | 34.06 | 3.68 | 5.50 | 3.77 | NaN | NaN |
2 | Ahmedabad | 2015-01-03 | NaN | NaN | 17.40 | 19.30 | 29.70 | NaN | 17.40 | 29.07 | 30.70 | 6.80 | 16.40 | 2.25 | NaN | NaN |
3 | Ahmedabad | 2015-01-04 | NaN | NaN | 1.70 | 18.48 | 17.97 | NaN | 1.70 | 18.59 | 36.08 | 4.43 | 10.14 | 1.00 | NaN | NaN |
4 | Ahmedabad | 2015-01-05 | NaN | NaN | 22.10 | 21.42 | 37.76 | NaN | 22.10 | 39.33 | 39.31 | 7.01 | 18.89 | 2.78 | NaN | NaN |
''' checking null values '''
df_city_day.isnull().sum()
City 0 Date 0 PM2.5 4598 PM10 11140 NO 3582 NO2 3585 NOx 4185 NH3 10328 CO 2059 SO2 3854 O3 4022 Benzene 5623 Toluene 8041 Xylene 18109 AQI 4681 AQI_Bucket 4681 dtype: int64
''' shape of dataset '''
df_city_day.shape
(29531, 16)
''' percentage of missing values in each column '''
for c in df_city_day.columns:
null_values = df_city_day[c].isna().sum()
percentage = (null_values / len(df_city_day)) * 100
print("In {}, mean of null value is: {}".format(c, percentage))
print("-" * 100)
In City, mean of null value is: 0.0 ---------------------------------------------------------------------------------------------------- In Date, mean of null value is: 0.0 ---------------------------------------------------------------------------------------------------- In PM2.5, mean of null value is: 15.570078900138837 ---------------------------------------------------------------------------------------------------- In PM10, mean of null value is: 37.72307067149775 ---------------------------------------------------------------------------------------------------- In NO, mean of null value is: 12.129626494192543 ---------------------------------------------------------------------------------------------------- In NO2, mean of null value is: 12.139785310351833 ---------------------------------------------------------------------------------------------------- In NOx, mean of null value is: 14.17154854220988 ---------------------------------------------------------------------------------------------------- In NH3, mean of null value is: 34.97341776438319 ---------------------------------------------------------------------------------------------------- In CO, mean of null value is: 6.9723341573262 ---------------------------------------------------------------------------------------------------- In SO2, mean of null value is: 13.050692492634857 ---------------------------------------------------------------------------------------------------- In O3, mean of null value is: 13.619586197555112 ---------------------------------------------------------------------------------------------------- In Benzene, mean of null value is: 19.041007754563 ---------------------------------------------------------------------------------------------------- In Toluene, mean of null value is: 27.229013578950934 ---------------------------------------------------------------------------------------------------- In Xylene, mean of null value is: 61.32200060952897 ---------------------------------------------------------------------------------------------------- In AQI, mean of null value is: 15.851139480545868 ---------------------------------------------------------------------------------------------------- In AQI_Bucket, mean of null value is: 15.851139480545868 ----------------------------------------------------------------------------------------------------
''' filling null values with mean of each column '''
for c in df_city_day.columns:
if df_city_day[c].isna().sum() > 0:
if df_city_day[c].dtype == 'float64':
df_city_day[c] = df_city_day[c].fillna(df_city_day[c].mean())
elif df_city_day[c].dtype == 'object':
df_city_day[c] = df_city_day[c].fillna(df_city_day[c].value_counts().index[0])
'''again checking null values'''
df_city_day.isna().sum()
City 0 Date 0 PM2.5 0 PM10 0 NO 0 NO2 0 NOx 0 NH3 0 CO 0 SO2 0 O3 0 Benzene 0 Toluene 0 Xylene 0 AQI 0 AQI_Bucket 0 dtype: int64
''' info of dataset '''
df_city_day.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 29531 entries, 0 to 29530 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 City 29531 non-null object 1 Date 29531 non-null object 2 PM2.5 29531 non-null float64 3 PM10 29531 non-null float64 4 NO 29531 non-null float64 5 NO2 29531 non-null float64 6 NOx 29531 non-null float64 7 NH3 29531 non-null float64 8 CO 29531 non-null float64 9 SO2 29531 non-null float64 10 O3 29531 non-null float64 11 Benzene 29531 non-null float64 12 Toluene 29531 non-null float64 13 Xylene 29531 non-null float64 14 AQI 29531 non-null float64 15 AQI_Bucket 29531 non-null object dtypes: float64(13), object(3) memory usage: 3.6+ MB
''' count of cities '''
city_label = df_city_day.City.value_counts().nlargest(10)
''' barplot '''
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
sns.barplot(city_label.index, city_label)
plt.xlabel('City', fontsize=20)
plt.ylabel('Count', fontsize=20);
''' making date as index of data '''
# df.hea
df_city_day.index = pd.DatetimeIndex(df_city_day['Date'])
df_city_day.head()
City | Date | PM2.5 | PM10 | NO | NO2 | NOx | NH3 | CO | SO2 | O3 | Benzene | Toluene | Xylene | AQI | AQI_Bucket | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||||||||
2015-01-01 | Ahmedabad | 2015-01-01 | 67.450578 | 118.127103 | 0.92 | 18.22 | 17.15 | 23.483476 | 0.92 | 27.64 | 133.36 | 0.00 | 0.02 | 0.00 | 166.463581 | Moderate |
2015-01-02 | Ahmedabad | 2015-01-02 | 67.450578 | 118.127103 | 0.97 | 15.69 | 16.46 | 23.483476 | 0.97 | 24.55 | 34.06 | 3.68 | 5.50 | 3.77 | 166.463581 | Moderate |
2015-01-03 | Ahmedabad | 2015-01-03 | 67.450578 | 118.127103 | 17.40 | 19.30 | 29.70 | 23.483476 | 17.40 | 29.07 | 30.70 | 6.80 | 16.40 | 2.25 | 166.463581 | Moderate |
2015-01-04 | Ahmedabad | 2015-01-04 | 67.450578 | 118.127103 | 1.70 | 18.48 | 17.97 | 23.483476 | 1.70 | 18.59 | 36.08 | 4.43 | 10.14 | 1.00 | 166.463581 | Moderate |
2015-01-05 | Ahmedabad | 2015-01-05 | 67.450578 | 118.127103 | 22.10 | 21.42 | 37.76 | 23.483476 | 22.10 | 39.33 | 39.31 | 7.01 | 18.89 | 2.78 | 166.463581 | Moderate |
''' dropping some columns '''
df_city_day.drop(['City' , 'PM2.5' , 'PM10','Benzene' , 'Toluene', 'Xylene' ,'AQI' ,'AQI_Bucket'], axis=1, inplace=True)
''' after dropping some columns, data looks like '''
df_city_day.head()
Date | NO | NO2 | NOx | NH3 | CO | SO2 | O3 | |
---|---|---|---|---|---|---|---|---|
Date | ||||||||
2015-01-01 | 2015-01-01 | 0.92 | 18.22 | 17.15 | 23.483476 | 0.92 | 27.64 | 133.36 |
2015-01-02 | 2015-01-02 | 0.97 | 15.69 | 16.46 | 23.483476 | 0.97 | 24.55 | 34.06 |
2015-01-03 | 2015-01-03 | 17.40 | 19.30 | 29.70 | 23.483476 | 17.40 | 29.07 | 30.70 |
2015-01-04 | 2015-01-04 | 1.70 | 18.48 | 17.97 | 23.483476 | 1.70 | 18.59 | 36.08 |
2015-01-05 | 2015-01-05 | 22.10 | 21.42 | 37.76 | 23.483476 | 22.10 | 39.33 | 39.31 |
''' Considering the pollutant NO '''
decomp_no = sm.tsa.seasonal_decompose(df_city_day['NO'].resample('2W').mean(), model='additive')
fig1 = decomp_no.plot()
plt.show()
''' SARIMA Model for NO '''
a = b = c = range(0, 3)
p = list(itertools.product(a, b, c))
seas_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(a, b, c))]
print('SARIMAX: {} x {}'.format(p[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(p[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(p[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(p[2], seasonal_pdq[4]))
SARIMAX: (0, 0, 1) x (0, 0, 1, 12) SARIMAX: (0, 0, 1) x (0, 0, 2, 12) SARIMAX: (0, 0, 2) x (0, 1, 0, 12) SARIMAX: (0, 0, 2) x (0, 1, 1, 12)
for params in p:
for p_seasonal in seas_pdq:
try:
m = sm.tsa.statespace.SARIMAX(df_city_day['NO'].resample('2W').mean(),order=param,seasonal_order=p_seasonal,
enforce_stationarity=False, enforce_invertibility=False)
results = m.fit()
except:
continue
''' model '''
model = sm.tsa.statespace.SARIMAX(df_city_day['NO'].resample('2W').mean(), order=(2, 1, 1), seasonal_order=(1, 1, 0, 12),
enforce_stationarity=False, enforce_invertibility=False)
res = model.fit()
print(res.summary().tables[1])
============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ ar.L1 0.8449 0.107 7.897 0.000 0.635 1.055 ar.L2 -0.0304 0.088 -0.347 0.728 -0.202 0.141 ma.L1 -1.0000 188.260 -0.005 0.996 -369.982 367.982 ar.S.L12 -0.6792 0.067 -10.078 0.000 -0.811 -0.547 sigma2 12.0933 2277.059 0.005 0.996 -4450.861 4475.048 ==============================================================================
''' plotting '''
res.plot_diagnostics(figsize=(16, 8))
plt.show()
y_pred = res.get_prediction(start=pd.to_datetime('2017-01-01'), dynamic=False)
y_pred_ci = y_pred.conf_int()
ax = df_city_day['NO'].resample('2W').mean()['2017':].plot(label='observed')
y_pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7, figsize=(14, 7))
ax.fill_between(y_pred_ci.index,
y_pred_ci.iloc[:, 0],
y_pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('NO Concentration')
plt.legend(['observed', 'Forecast'])
plt.show()
''' calculating mse and rmse '''
y_f = y_pred.predicted_mean
actual = df_city_day['NO'].resample('2W').mean()['2017-01-01':]
mse = ((y_f - actual) ** 2).mean()
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))
print('The Root Mean Squared Error of our forecasts is {}'.format(round(np.sqrt(mse), 2)))
The Mean Squared Error of our forecasts is 12.83 The Root Mean Squared Error of our forecasts is 3.58
''' considering the pollutant SO2 '''
df_city_day['NO'].resample('2W').mean().plot(figsize=(15, 6))
plt.show()
decomp_so2 = sm.tsa.seasonal_decompose(df_city_day['NO'].resample('2W').mean(), model='additive')
fig_so2 = decomp_so2.plot()
plt.show()
''' SARIMA for SO2 '''
a = b = c = range(0, 3)
p = list(itertools.product(a, b, c))
seas_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(a, b, c))]
print('SARIMAX: {} x {}'.format(p[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(p[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(p[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(p[2], seasonal_pdq[4]))
SARIMAX: (0, 0, 1) x (0, 0, 1, 12) SARIMAX: (0, 0, 1) x (0, 0, 2, 12) SARIMAX: (0, 0, 2) x (0, 1, 0, 12) SARIMAX: (0, 0, 2) x (0, 1, 1, 12)
for params in p:
for p_seasonal in seas_pdq:
try:
m = sm.tsa.statespace.SARIMAX(df_city_day['SO2'].resample('2W').mean(),order=param,seasonal_order=p_seasonal,
enforce_stationarity=False, enforce_invertibility=False)
results = m.fit()
except:
continue
mod = sm.tsa.statespace.SARIMAX(df_city_day['SO2'].resample('2W').mean(), order=(2, 1, 1), seasonal_order=(1, 1, 0, 12),
enforce_stationarity=False, enforce_invertibility=False)
res = mod.fit()
print(results.summary().tables[1])
============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ ar.L1 1.0314 0.123 8.410 0.000 0.791 1.272 ar.L2 -0.2354 0.094 -2.509 0.012 -0.419 -0.052 ma.L1 -0.9632 0.050 -19.292 0.000 -1.061 -0.865 ar.S.L12 -0.4423 0.076 -5.786 0.000 -0.592 -0.293 sigma2 4.6062 0.630 7.315 0.000 3.372 5.840 ==============================================================================
''' plotting '''
res.plot_diagnostics(figsize=(16, 8))
plt.show()
y_pred = res.get_prediction(start=pd.to_datetime('2017-01-01'), dynamic=False)
y_pred_ci = y_pred.conf_int()
ax = df_city_day['SO2'].resample('2W').mean()['2017':].plot(label='observed')
y_pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7, figsize=(14, 7))
ax.fill_between(y_pred_ci.index,
y_pred_ci.iloc[:, 0],
y_pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('NO Concentration')
plt.legend(['observed', 'Forecast'])
plt.show()
''' calculating mse and rmse '''
y_f = y_pred.predicted_mean
actual = df_city_day['SO2'].resample('2W').mean()['2017-01-01':]
mse = ((y_f - actual) ** 2).mean()
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))
print('The Root Mean Squared Error of our forecasts is {}'.format(round(np.sqrt(mse), 2)))
The Mean Squared Error of our forecasts is 4.92 The Root Mean Squared Error of our forecasts is 2.22