# Install custom Python packages
!pip install pyeucountrycodes --quiet
!pip install calplot --quiet

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.3/132.3 kB 2.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
  Building wheel for calplot (setup.py) ... done


# Load required Python packages
import os, re, requests, logging, calplot
import pandas as pd
import dask.dataframe as dd
import multiprocessing as mp
from dask.multiprocessing import get
from dask.diagnostics import ProgressBar
ProgressBar().register()
from tqdm import tqdm
from itertools import product
from eu_country_codes import COUNTRY_CODES


# Function to download remote file to the disk
def urlDownload(urlLink, showProgress = False):
  """Download a remote file into the current directory and return its name.

  The local file name is taken from the ``filename=`` parameter of the
  response's Content-Disposition header. The transfer is skipped when a file
  with that name already exists and its size equals the Content-Length
  reported by the server.

  Args:
    urlLink: URL to fetch with a streaming HTTP GET request.
    showProgress: When True, print the file name and show a tqdm progress bar.

  Returns:
    The name of the local file, relative to the current directory.

  Raises:
    requests.HTTPError: If the server answers with an error status code.
    ValueError: If the response does not provide a usable file name.
  """
  with requests.get(urlLink, stream=True) as r:
    # Fail early instead of saving an error page as if it were data
    r.raise_for_status()

    # Content-Length is optional; None means the size is unknown
    contentLength = r.headers.get('Content-Length')
    fileSize = int(contentLength) if contentLength is not None else None

    # Take the value after "filename=", drop trailing parameters and quotes
    disposition = r.headers.get('Content-Disposition') or ''
    fileName = disposition.partition('filename=')[2].split(';')[0].strip().strip('"\'')
    # The header is server-controlled: keep only the last path component
    fileName = os.path.basename(fileName)
    if not fileName:
      raise ValueError(f"No file name in Content-Disposition header for {urlLink}")

    # Without a known size the local copy cannot be validated, so re-download
    upToDate = (
      fileSize is not None
      and os.path.exists(fileName)
      and os.path.getsize(fileName) == fileSize
    )
    if not upToDate:
      block_size = 1024
      if showProgress:
        print(f"Downloading {fileName}")
      # The context manager closes the bar even when the transfer fails;
      # disable= turns the bar into a no-op when no progress is requested
      with tqdm(total=fileSize, unit='iB', unit_scale=True, disable=not showProgress) as progress_bar:
        with open(fileName, 'wb') as file:
          for data in r.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)
    return fileName


# Fetch the most recent data file; urlDownload returns the local file name,
# which is echoed as the cell result
urlLocation = 'https://aqicn.org/data-platform/covid19/report/39374-7694ec07/'
csvFile = urlDownload(urlLink=urlLocation, showProgress=True)
csvFile

'waqi-covid19-airqualitydata-2024.csv'


# Year names "2019".."2023" and quarter names "Q1".."Q4", all as strings
yNames = [f"{year}" for year in range(2019, 2024)]
qNames = [f"Q{quarter}" for quarter in range(1, 5)]

# Build one download URL per year/quarter pair by appending the year and
# quarter names (e.g. "2019" + "Q1") to the base url location; the result is
# a data frame with the single column 'urlLocations'
DF = pd.DataFrame({
  'urlLocations': [urlLocation + year + quarter for year, quarter in product(yNames, qNames)]
})
DF


# Download legacy data (sequentially)
# Each row is addressed by its column label: an integer key on a
# label-indexed row is a deprecated positional lookup in recent pandas
csvFiles = DF.apply(lambda row: urlDownload(row['urlLocations'], showProgress=True), axis=1)

Downloading waqi-covid19-airqualitydata-2019Q1.csv

100%|██████████| 26.5M/26.5M [00:03<00:00, 8.56MiB/s]

Downloading waqi-covid19-airqualitydata-2019Q2.csv

100%|██████████| 28.2M/28.2M [00:03<00:00, 8.72MiB/s]

Downloading waqi-covid19-airqualitydata-2019Q3.csv

100%|██████████| 31.6M/31.6M [00:03<00:00, 9.34MiB/s]

Downloading waqi-covid19-airqualitydata-2019Q4.csv

100%|██████████| 31.3M/31.3M [00:03<00:00, 9.57MiB/s]

Downloading waqi-covid19-airqualitydata-2020Q1.csv

100%|██████████| 29.1M/29.1M [00:03<00:00, 8.59MiB/s]


# Remove every legacy data file that was just downloaded
for delFile in csvFiles:
  os.remove(delFile)


# Download legacy data (in parallel)
# The URL table is split into one partition per CPU core and evaluated with
# the threaded scheduler. Each row is addressed by its column label: an
# integer key on a label-indexed row is a deprecated positional lookup
DDF = dd.from_pandas(DF, npartitions=mp.cpu_count())
csvFiles = DDF.apply(lambda row: urlDownload(row['urlLocations']), axis=1, meta=pd.Series(dtype="str")).compute(scheduler='threads')

[########################################] | 100% Completed | 41.36 s


# Columns to load: the identifying columns plus one statistic column
meta_cols = ['Date', 'Country', 'City', 'Specie']
# Statistic to analyse; alternatives: 'count', 'min', 'max', 'median', 'variance'
main_column = 'median'
selected_cols = [*meta_cols, main_column]

# Load the selected columns of the newest data file, skipping its first
# 4 lines
DF = pd.read_csv(csvFile, usecols=selected_cols, skiprows=4)

# Keep only the rows whose country code is in COUNTRY_CODES and expose the
# chosen statistic under the column name Value
selectEU = DF['Country'].isin(COUNTRY_CODES)
newTable = DF.loc[selectEU].rename(columns={main_column: 'Value'})
print(newTable)

              Date Country      City         Specie  Value
19346   2024-01-02      HU  Debrecen            dew    2.5
19347   2024-01-08      HU  Debrecen            dew   -9.0
19348   2024-01-11      HU  Debrecen            dew   -6.0
19349   2024-01-17      HU  Debrecen            dew   -3.0
19350   2024-01-20      HU  Debrecen            dew   -5.0
...            ...     ...       ...            ...    ...
127266  2024-01-19      AT      Graz  precipitation   99.4
127267  2024-01-01      AT      Graz  precipitation    1.2
127268  2024-01-06      AT      Graz  precipitation    0.5
127269  2024-01-07      AT      Graz  precipitation    0.3
127270  2024-01-08      AT      Graz  precipitation   99.1

[36351 rows x 5 columns]


# Read legacy data files (sequentially)
# The dot before "csv" is escaped so that only names ending in "Q<digit>.csv"
# match; sorting makes the read order independent of the os.listdir ordering
fileNamesQ = sorted(f for f in os.listdir('.') if re.match(r'^.*Q\d\.csv$', f))
# Each file is read without its first 4 lines, then all are stacked into one
# table with a fresh index
DF = pd.concat((pd.read_csv(f, skiprows=4, usecols=selected_cols) for f in fileNamesQ), ignore_index=True)
# Keep only the rows whose country code is in COUNTRY_CODES and rename the
# main column to Value
selectEU = DF['Country'].isin(COUNTRY_CODES)
oldTable = DF[selectEU].rename(columns={main_column: 'Value'})
print(oldTable)

                Date Country      City    Specie   Value
27384     2020-05-30      AT  Salzburg      pm10    10.0
27385     2020-06-12      AT  Salzburg      pm10    10.0
27386     2020-04-14      AT  Salzburg      pm10    10.0
27387     2020-04-17      AT  Salzburg      pm10    19.0
27388     2020-04-25      AT  Salzburg      pm10    25.0
...              ...     ...       ...       ...     ...
10102970  2020-03-10      LT    Kaunas  pressure  1010.0
10102971  2020-01-04      LT    Kaunas  pressure  1003.0
10102972  2020-03-30      LT    Kaunas  pressure  1019.0
10102973  2020-03-31      LT    Kaunas  pressure  1021.5
10102974  2020-03-29      LT    Kaunas  pressure  1013.0

[2891334 rows x 5 columns]


# Read legacy data files (in parallel)
# The dot before "csv" is escaped so that only names ending in "Q<digit>.csv"
# match; sorting makes the read order independent of the os.listdir ordering
fileNamesQ = sorted(f for f in os.listdir('.') if re.match(r'^.*Q\d\.csv$', f))
# dask reads the files (each without its first 4 lines); compute() returns
# the combined result as a pandas data frame
DF = dd.read_csv(fileNamesQ, skiprows=4, usecols=selected_cols).compute()
# Keep only the rows whose country code is in COUNTRY_CODES and rename the
# main column to Value
selectEU = DF['Country'].isin(COUNTRY_CODES)
oldTable = DF[selectEU].rename(columns={main_column: 'Value'})
print(oldTable)

[########################################] | 100% Completed | 9.01 s
              Date Country      City    Specie   Value
27384   2020-05-30      AT  Salzburg      pm10    10.0
27385   2020-06-12      AT  Salzburg      pm10    10.0
27386   2020-04-14      AT  Salzburg      pm10    10.0
27387   2020-04-17      AT  Salzburg      pm10    19.0
27388   2020-04-25      AT  Salzburg      pm10    25.0
...            ...     ...       ...       ...     ...
539568  2020-03-10      LT    Kaunas  pressure  1010.0
539569  2020-01-04      LT    Kaunas  pressure  1003.0
539570  2020-03-30      LT    Kaunas  pressure  1019.0
539571  2020-03-31      LT    Kaunas  pressure  1021.5
539572  2020-03-29      LT    Kaunas  pressure  1013.0

[2891334 rows x 5 columns]


# Stack the legacy table (2018-2023) on top of the newest one (2024), order
# the rows by country, city and date, then drop fully identical rows
DF = pd.concat((oldTable, newTable))
dataTableEU = DF.sort_values(by=['Country', 'City', 'Date'])
dataTableEU = dataTableEU.drop_duplicates()
print(dataTableEU)

             Date Country    City       Specie   Value
10266  2018-12-31      AT    Graz         pm10    11.0
10392  2018-12-31      AT    Graz     pressure  1027.1
10505  2018-12-31      AT    Graz   wind-speed     1.5
10547  2018-12-31      AT    Graz           co     0.1
10673  2018-12-31      AT    Graz  temperature     3.7
...           ...     ...     ...          ...     ...
77191  2024-01-23      SK  Košice     humidity    94.0
77196  2024-01-23      SK  Košice    wind-gust     2.7
77209  2024-01-23      SK  Košice          dew    -4.0
77245  2024-01-23      SK  Košice  temperature    -3.3
77266  2024-01-23      SK  Košice          so2     4.6

[2771315 rows x 5 columns]


# Weather conditions and pollutants (PM10, PM2.5, NO2, Ozone, SO2, CO)

# Share of each Specie in the data table, in percent of all rows.
# Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in recent pandas versions
all_vars = 100 * dataTableEU['Specie'].value_counts() / len(dataTableEU)

# Drop the variables that are not needed
drop_weat = ['pressure', 'wind-speed', 'wind-gust', 'wind speed', 'wind gust', 'dew', 'precipitation']
drop_poll = ['wd', 'aqi', 'uvi', 'pm1', 'neph', 'mepaqi']
keep_vars = set(all_vars.index) - set(drop_weat + drop_poll)

# One-row table with the shares of the kept variables, largest first;
# the styled table (index hidden) is the cell result
new_data_table = pd.DataFrame([all_vars[list(keep_vars)].sort_values(ascending=False)])
new_data_table.style.hide(axis="index")


# Restrict the table to the selected variables, then summarise their daily
# values per variable (the summary is the cell result)
selectedVars = ['temperature', 'humidity']
selectedIdx = dataTableEU['Specie'].isin(selectedVars)
dataTableEU = dataTableEU.loc[selectedIdx]
dataTableEU.groupby('Specie')['Value'].describe()


# 2021-10-03 Barcelona fix
# Rows sharing the same Date, Country, City and Specie are collapsed into one
# row holding the mean of their values; the table is printed before and after
# NOTE(review): presumably needed because of conflicting duplicate readings
# for Barcelona on 2021-10-03 - confirm against the raw data
print(dataTableEU)
dataTableEU = dataTableEU.groupby(['Date', 'Country', 'City', 'Specie'], as_index=False)[['Value']].mean()
print(dataTableEU)

             Date Country    City       Specie  Value
10673  2018-12-31      AT    Graz  temperature    3.7
10751  2018-12-31      AT    Graz     humidity   70.2
10637  2019-01-01      AT    Graz  temperature    3.0
10771  2019-01-01      AT    Graz     humidity   75.7
10710  2019-01-02      AT    Graz  temperature   -0.2
...           ...     ...     ...          ...    ...
77244  2024-01-21      SK  Košice  temperature   -2.5
77176  2024-01-22      SK  Košice     humidity   94.0
77249  2024-01-22      SK  Košice  temperature   -3.8
77191  2024-01-23      SK  Košice     humidity   94.0
77245  2024-01-23      SK  Košice  temperature   -3.3

[548140 rows x 5 columns]
              Date Country        City       Specie  Value
0       2018-12-31      AT        Graz     humidity   70.2
1       2018-12-31      AT        Graz  temperature    3.7
2       2018-12-31      AT   Innsbruck     humidity   93.0
3       2018-12-31      AT   Innsbruck  temperature    0.0
4       2018-12-31      AT        Linz     humidity   92.0
...            ...     ...         ...          ...    ...
548132  2024-01-23      SE     Uppsala  temperature    4.0
548133  2024-01-23      SK  Bratislava     humidity   91.0
548134  2024-01-23      SK  Bratislava  temperature   -1.8
548135  2024-01-23      SK      Košice     humidity   94.0
548136  2024-01-23      SK      Košice  temperature   -3.3

[548137 rows x 5 columns]


# Reshape to one row per Date/Country/City with one column per Specie
dataTableTHI = dataTableEU.pivot_table(values='Value', index=['Date', 'Country', 'City'], columns='Specie').reset_index()
# THI for each row, computed from the temperature and humidity columns
dataTableTHI = dataTableTHI.assign(
  THI=lambda t: 0.8 * t['temperature'] + (t['humidity'] / 100) * (t['temperature'] - 14.4) + 46.4
)
# Rows where THI could not be computed (missing value) are removed
dataTableTHI = dataTableTHI.dropna(subset=["THI"])
print(dataTableTHI)

Specie        Date Country        City  humidity  temperature      THI
0       2018-12-31      AT        Graz      70.2          3.7  41.8486
1       2018-12-31      AT   Innsbruck      93.0          0.0  33.0080
2       2018-12-31      AT        Linz      92.0          3.3  38.8280
3       2018-12-31      AT    Salzburg      96.5          1.0  34.2690
4       2018-12-31      AT      Vienna      70.0          4.5  43.0700
...            ...     ...         ...       ...          ...      ...
274188  2024-01-23      SE       Malmö      89.0          4.4  41.0200
274189  2024-01-23      SE   Stockholm      75.0          5.0  43.3500
274190  2024-01-23      SE     Uppsala      80.0          4.0  41.2800
274191  2024-01-23      SK  Bratislava      91.0         -1.8  30.2180
274192  2024-01-23      SK      Košice      94.0         -3.3  27.1220

[273944 rows x 6 columns]


# Date and THI of the chosen city from 2019-01-01 onwards, followed by
# descriptive statistics per calendar year (the cell result)
myCity = 'Kaunas'
myTable = dataTableTHI.loc[
  (dataTableTHI['City'] == myCity) & (dataTableTHI['Date'] >= '2019-01-01'),
  ["Date", "THI"],
]
myTable.groupby(pd.to_datetime(myTable['Date']).dt.year).describe()


# Calendar heatmap of the THI time series
# The matplotlib font manager logger is disabled before plotting
logging.getLogger('matplotlib.font_manager').disabled = True
# THI values indexed by their dates
pdTimeSeries = pd.Series(myTable['THI'].to_numpy(), index=pd.DatetimeIndex(myTable['Date']))
cp = calplot.calplot(pdTimeSeries, cmap='coolwarm', dropzero=True, suptitle=f'THI @ {myCity}')


# Line plot of the THI time series over time
ts = pdTimeSeries.plot(lw=1, grid=True, title=f'THI @ {myCity}', xlabel="Time", ylabel="THI")


#!jupyter nbconvert --to html ASSIST_Air_Data_Python.ipynb

	urlLocations
0	https://aqicn.org/data-platform/covid19/report...
1	https://aqicn.org/data-platform/covid19/report...
2	https://aqicn.org/data-platform/covid19/report...
3	https://aqicn.org/data-platform/covid19/report...
4	https://aqicn.org/data-platform/covid19/report...
5	https://aqicn.org/data-platform/covid19/report...
6	https://aqicn.org/data-platform/covid19/report...
7	https://aqicn.org/data-platform/covid19/report...
8	https://aqicn.org/data-platform/covid19/report...
9	https://aqicn.org/data-platform/covid19/report...
10	https://aqicn.org/data-platform/covid19/report...
11	https://aqicn.org/data-platform/covid19/report...
12	https://aqicn.org/data-platform/covid19/report...
13	https://aqicn.org/data-platform/covid19/report...
14	https://aqicn.org/data-platform/covid19/report...
15	https://aqicn.org/data-platform/covid19/report...
16	https://aqicn.org/data-platform/covid19/report...
17	https://aqicn.org/data-platform/covid19/report...
18	https://aqicn.org/data-platform/covid19/report...
19	https://aqicn.org/data-platform/covid19/report...

	count	mean	std	min	25%	50%	75%	max
Specie
humidity	273953.0	71.272162	16.891712	0.0	60.0	73.6	84.6	100.0
temperature	274187.0	12.625512	8.099295	-67.7	6.8	12.6	18.5	81.1

Specie	THI
	count	mean	std	min	25%	50%	75%	max
Date
2019	362.0	49.242705	12.738765	13.640	38.9960	49.53540	59.722625	75.0560
2020	348.0	50.388563	10.801356	28.020	40.6560	50.18475	59.673125	71.4790
2021	362.0	46.690309	14.882599	2.696	35.8275	47.17400	58.801250	74.7400
2022	281.0	50.237457	12.161641	21.632	39.6800	50.30800	60.312000	73.6255
2023	281.0	44.728998	11.624320	17.000	34.8800	44.34750	53.772000	71.9085
2024	23.0	25.604696	8.608295	3.476	21.2165	25.91000	33.152000	37.2840

Air Quality Data Processing with Python¶

1. Prepare Python environment¶

2. Download data files¶

3. Load and prepare data¶

4. Check and select vars¶

5. Pivot and calculate THI¶

6. THI statistics and plots¶

7. Export results to .html¶