# Install / load required R libraries
# pacman is installed on demand and then used to install/attach everything else.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(dplyr, tidyr, ggplot2) # required subset of tidyverse packages
pacman::p_load(httr, data.table, dtplyr, xts, eurostat, pals, pbapply, doParallel)

# EU country codes, harmonized by eurostat; used below to filter the data
euCodes <- eurostat::harmonize_country_code(eurostat::eu_countries$code)

# Text progress bars for the pbapply calls
pbapply::pboptions(type = "txt", style = 3)

# Start one worker per detected CPU core and register it as the parallel backend.
# detectCores() returns NA when the core count is unknown, which would make
# makeCluster() fail, so fall back to a single worker in that case.
cpuCores <- parallel::detectCores()
if (is.na(cpuCores)) cpuCores <- 1L
cl <- parallel::makeCluster(cpuCores)
doParallel::registerDoParallel(cl)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘zoo’


xts installed

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘proxy’, ‘Rcpp’, ‘e1071’, ‘plyr’, ‘bibtex’, ‘assertthat’, ‘here’, ‘classInt’, ‘countrycode’, ‘ISOweek’, ‘RefManageR’, ‘regions’


eurostat installed

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘dichromat’, ‘mapproj’, ‘maps’


pals installed

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)


pbapply installed

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘foreach’, ‘iterators’


doParallel installed


# Function to download remote file to the disk
#
# Reads the response headers of `urlLink` to learn the remote file name (from
# the header line containing "filename=") and the remote size (from the
# "content-length" header). The file is downloaded into the working directory
# unless a local file with the same name and size already exists.
#
# Arguments:
#   urlLink      - url of the remote file
#   showProgress - if TRUE, print a "Downloading ..." message and a progress bar
# Returns:
#   the local file name (character scalar)
# Errors:
#   stops when the headers carry no file name or the download request fails
urlDownload <- function(urlLink, showProgress = FALSE) {
  urlHead <- curlGetHeaders(urlLink)

  # Locate the size header by name instead of by position: its index is not
  # fixed, and redirects add extra header sets (the last one is the final
  # response).
  sizeLines <- grep("^content-length:", urlHead, ignore.case = TRUE, value = TRUE)
  fileSize <- NA_real_
  if (length(sizeLines) > 0) {
    sizeText <- sub("^content-length:", "", tolower(sizeLines[length(sizeLines)]))
    fileSize <- as.numeric(trimws(sizeText))
  }

  nameLines <- urlHead[grepl("filename", urlHead)]
  if (length(nameLines) == 0) {
    stop("No file name found in the response headers of ", urlLink, call. = FALSE)
  }
  fileName <- gsub("^.*filename=(.*)\r\n$", "\\1", nameLines[length(nameLines)])
  # The name comes from the server; keep only its last path component so the
  # file always lands in the working directory.
  fileName <- basename(fileName)

  # Skip the download only when the local copy exists and its size is known
  # to match the remote size.
  upToDate <- file.exists(fileName) && !is.na(fileSize) &&
    isTRUE(file.info(fileName)$size == fileSize)

  if (!upToDate) {
    toDisk <- httr::write_disk(fileName, overwrite = TRUE)
    if (showProgress) {
      message(sprintf("Downloading %s", fileName))
      response <- httr::GET(urlLink, toDisk, httr::progress())
      cat("\n")
    } else {
      response <- httr::GET(urlLink, toDisk)
    }
    # Fail loudly on an HTTP error instead of keeping an error page as data
    httr::stop_for_status(response)
  }
  return(fileName)
}


# Download the newest data
# The report url serves the current-year csv; the local file name is kept
# in csvFile and echoed.
urlLocation <- 'https://aqicn.org/data-platform/covid19/report/39374-7694ec07/'
csvFile <- urlDownload(urlLink = urlLocation, showProgress = TRUE)
csvFile

Downloading waqi-covid19-airqualitydata-2024.csv

  |======================================================================| 100%


# Year and quarter labels of the legacy files
yNames <- as.character(seq(2019, 2023))
qNames <- sprintf("Q%d", 1:4)

# Build every year/quarter pair, put the base url in front of it and glue the
# three columns together into a single column of download urls
DF <- crossing(yNames, qNames) %>%
  data.frame(urlLocation, .) %>%
  unite(urlLocations, c(urlLocation, "yNames", "qNames"), sep = "")
print(DF)

                                                           urlLocations
1  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2019Q1
2  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2019Q2
3  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2019Q3
4  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2019Q4
5  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2020Q1
6  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2020Q2
7  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2020Q3
8  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2020Q4
9  https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2021Q1
10 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2021Q2
11 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2021Q3
12 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2021Q4
13 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2022Q1
14 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2022Q2
15 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2022Q3
16 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2022Q4
17 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2023Q1
18 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2023Q2
19 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2023Q3
20 https://aqicn.org/data-platform/covid19/report/39374-7694ec07/2023Q4


# Download legacy data (sequentially)
# One urlDownload() call per url, each with its own progress output
csvFiles <- lapply(DF$urlLocations, urlDownload, showProgress = TRUE)

Downloading waqi-covid19-airqualitydata-2019Q1.csv

  |======================================================================| 100%

Downloading waqi-covid19-airqualitydata-2019Q2.csv

  |======================================================================| 100%

Downloading waqi-covid19-airqualitydata-2019Q3.csv

  |======================================================================| 100%

Downloading waqi-covid19-airqualitydata-2019Q4.csv

  |======================================================================| 100%

Downloading waqi-covid19-airqualitydata-2020Q1.csv

  |======================================================================| 100%


# Delete downloaded legacy data files
# csvFiles is a list of file names; flatten it to a character vector first
unlink(unlist(csvFiles))


# Download legacy data (in parallel)
# The urls are distributed over the workers of cluster `cl`. urlDownload is
# called with its default showProgress = FALSE, so the only progress output
# is pblapply's own bar.
csvFiles <- pbapply::pblapply(DF$urlLocations, urlDownload, cl = cl)

  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100%


# Statistic columns found in the csv files; only the main one is loaded
statCols <- c('count', 'min', 'max', 'median', 'variance')
mainCols <- 'median'
dropCols <- statCols[!statCols %in% mainCols]

# Read the newest data file (skipping its first 4 lines), keep the EU rows
# and rename the main column to Value
newTable <- data.table::fread(csvFile, skip = 4, drop = dropCols, showProgress = TRUE) %>%
  filter(Country %in% euCodes) %>%
  rename(Value = all_of(mainCols)) %>%
  glimpse()

Rows: 36,351
Columns: 5
$ Date    <IDate> 2024-01-02, 2024-01-08, 2024-01-11, 2024-01-17, 2024-01-20, …
$ Country <chr> "HU", "HU", "HU", "HU", "HU", "HU", "HU", "HU", "HU", "HU", "H…
$ City    <chr> "Debrecen", "Debrecen", "Debrecen", "Debrecen", "Debrecen", "D…
$ Specie  <chr> "dew", "dew", "dew", "dew", "dew", "dew", "dew", "dew", "dew",…
$ Value   <dbl> 2.5, -9.0, -6.0, -3.0, -5.0, -4.0, 4.0, 5.0, -2.5, -5.0, -5.0,…


# Read legacy data files (sequentially)
# Quarterly files are recognised by a name ending in "Q<digit>.csv"; the dot
# is escaped so that it only matches a literal ".".
fileNamesQ <- list.files('.', '^.*Q\\d\\.csv$')

# Read each file (skipping its first 4 lines, dropping the unused statistic
# columns, reading column 1 as IDate) and stack the tables by column name
oldTable <- data.table::rbindlist(
  lapply(fileNamesQ, function(x) {
    data.table::fread(x, skip = 4, drop = dropCols, colClasses = list(IDate = 1))
  }),
  use.names = TRUE
)

# Leave EU data, rename main column to Value
oldTable <- oldTable %>%
  rename(Value = all_of(mainCols)) %>%
  filter(Country %in% euCodes) %>%
  glimpse()

Rows: 2,891,334
Columns: 5
$ Date    <IDate> 2019-01-03, 2019-02-05, 2019-03-04, 2019-03-26, 2019-03-27, …
$ Country <chr> "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "C…
$ City    <chr> "Prague", "Prague", "Prague", "Prague", "Prague", "Prague", "P…
$ Specie  <chr> "temperature", "temperature", "temperature", "temperature", "t…
$ Value   <dbl> -2.2, -2.0, 9.7, 5.8, 6.6, 5.8, 0.0, 6.3, 12.6, -4.4, 2.1, -1.…


# Read legacy data files (in parallel)
# The workers need dropCols and the data.table package before they can run
# the reader function
parallel::clusterExport(cl, c("dropCols"))
parallel::clusterEvalQ(cl, pacman::p_load(data.table))

# Quarterly files are recognised by a name ending in "Q<digit>.csv"; the dot
# is escaped so that it only matches a literal ".".
fileNamesQ <- list.files('.', '^.*Q\\d\\.csv$')

# Read the files on the cluster (skipping 4 lines, dropping the unused
# statistic columns, reading column 1 as IDate) and stack them by column name
oldTable <- data.table::rbindlist(
  pbapply::pblapply(
    fileNamesQ,
    function(x) fread(x, skip = 4, drop = dropCols, colClasses = list(IDate = 1)),
    cl = cl
  ),
  use.names = TRUE
)

# Leave EU data, rename main column to Value
oldTable <- oldTable %>%
  rename(Value = all_of(mainCols)) %>%
  filter(Country %in% euCodes) %>%
  glimpse()

  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100%
Rows: 2,891,334
Columns: 5
$ Date    <IDate> 2019-01-03, 2019-02-05, 2019-03-04, 2019-03-26, 2019-03-27, …
$ Country <chr> "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "CZ", "C…
$ City    <chr> "Prague", "Prague", "Prague", "Prague", "Prague", "Prague", "P…
$ Specie  <chr> "temperature", "temperature", "temperature", "temperature", "t…
$ Value   <dbl> -2.2, -2.0, 9.7, 5.8, 6.6, 5.8, 0.0, 6.3, 12.6, -4.4, 2.1, -1.…


# Stack the old (2018-2023) and new (2024) tables, order the rows by
# country, city and date, then drop exact duplicate rows
dataTableEU <- bind_rows(oldTable, newTable) %>%
  arrange(Country, City, Date) %>%
  unique() %>%
  glimpse()

Rows: 2,771,315
Columns: 5
$ Date    <IDate> 2018-12-31, 2018-12-31, 2018-12-31, 2018-12-31, 2018-12-31, …
$ Country <chr> "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "A…
$ City    <chr> "Graz", "Graz", "Graz", "Graz", "Graz", "Graz", "Graz", "Graz"…
$ Specie  <chr> "pm10", "pressure", "wind-speed", "co", "temperature", "humidi…
$ Value   <dbl> 11.0, 1027.1, 1.5, 0.1, 3.7, 70.2, 1.1, 14.2, 4.3, 30.0, 1026.…


# Weather conditions and pollutants (PM10, PM2.5, NO2, Ozone, SO2, CO)

# Share (in percent) of each Specie among all rows, largest first
specieCounts <- table(dataTableEU$Specie)
allVars <- sort(100 * prop.table(specieCounts), decreasing = TRUE)

# Variables that are not needed: some weather measures and some indices
dropWeat <- c('pressure', 'wind-speed', 'wind-gust', 'wind speed', 'wind gust', 'dew','precipitation')
dropPoll <- c('wd', 'aqi', 'uvi', 'pm1', 'neph', 'mepaqi')
keepVars <- setdiff(names(allVars), c(dropWeat, dropPoll))

# One-row table with the share of every kept variable
keptShares <- allVars[keepVars]
setNames(data.table(t(as.numeric(keptShares))), names(keptShares))


# Keep only the selected variables and summarise their daily values
selectedVars <- c('temperature', 'humidity')
dataTableEU <- dataTableEU %>%
  filter(Specie %in% selectedVars) %>%
  glimpse()
cat('\n')
# One summary() per Specie
with(dataTableEU, tapply(Value, factor(Specie), summary))

Rows: 548,140
Columns: 5
$ Date    <IDate> 2018-12-31, 2018-12-31, 2019-01-01, 2019-01-01, 2019-01-02, …
$ Country <chr> "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "A…
$ City    <chr> "Graz", "Graz", "Graz", "Graz", "Graz", "Graz", "Graz", "Graz"…
$ Specie  <chr> "temperature", "humidity", "temperature", "humidity", "tempera…
$ Value   <dbl> 3.7, 70.2, 3.0, 75.7, -0.2, 79.5, -1.2, 69.0, -1.4, 66.2, 0.8,…

$humidity
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   60.00   73.60   71.27   84.60  100.00 

$temperature
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -67.70    6.80   12.60   12.63   18.50   81.10


# How many cities exist for each country?
# Count the distinct (Country, City) pairs per country, largest first
countrySize <- unique(select(dataTableEU, Country, City)) %>%
  group_by(Country) %>%
  tally(sort = TRUE) %>%
  rename(Cities = n)

# Show the 13 largest and the 6 smallest countries side by side,
# separated by a ".." column
cbind(
  t(head(countrySize, 13)),
  matrix("..", 2, 1),
  t(tail(countrySize, 6))
)


# Which EU countries are we missing?
# euCodes holds harmonized codes, while eu_countries$code holds the raw
# eurostat codes; harmonize the code column as well so that both sides of the
# comparison use the same coding.
missingCodes <- setdiff(euCodes, unique(dataTableEU$Country))
eurostat::eu_countries %>%
  filter(eurostat::harmonize_country_code(code) %in% missingCodes) %>%
  select(name) %>%
  glimpse()

Rows: 4
Columns: 1
$ name <chr> "Latvia", "Luxembourg", "Malta", "Slovenia"


# 2021-10-03 Barcelona fix
# Show the table before the fix, then collapse rows sharing the same
# Date, Country, City and Specie into one row holding their mean Value
glimpse(dataTableEU)
dataTableEU <- dataTableEU %>%
  group_by(Date, Country, City, Specie) %>%
  summarise(Value = mean(Value), .groups = "drop") %>%
  glimpse()

Rows: 548,140
Columns: 5
$ Date    <IDate> 2018-12-31, 2018-12-31, 2019-01-01, 2019-01-01, 2019-01-02, …
$ Country <chr> "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "A…
$ City    <chr> "Graz", "Graz", "Graz", "Graz", "Graz", "Graz", "Graz", "Graz"…
$ Specie  <chr> "temperature", "humidity", "temperature", "humidity", "tempera…
$ Value   <dbl> 3.7, 70.2, 3.0, 75.7, -0.2, 79.5, -1.2, 69.0, -1.4, 66.2, 0.8,…
Rows: 548,137
Columns: 5
$ Date    <IDate> 2018-12-31, 2018-12-31, 2018-12-31, 2018-12-31, 2018-12-31, …
$ Country <chr> "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "B…
$ City    <chr> "Graz", "Graz", "Innsbruck", "Innsbruck", "Linz", "Linz", "Sal…
$ Specie  <chr> "humidity", "temperature", "humidity", "temperature", "humidit…
$ Value   <dbl> 70.2, 3.7, 93.0, 0.0, 92.0, 3.3, 96.5, 1.0, 70.0, 4.5, 92.6, 9…


# Spread temperature and humidity into their own columns, fill missing values
# with the median of the same date and country, compute THI for each row and
# drop the rows where THI is still missing
selectedVars <- c('temperature', 'humidity')
dataTableTHI <- dataTableEU %>%
  pivot_wider(names_from = Specie, values_from = Value) %>%
  group_by(Date, Country) %>%
  mutate(across(
    all_of(selectedVars),
    function(v) ifelse(is.na(v), median(v, na.rm = TRUE), v)
  )) %>%
  mutate(THI = 0.8 * temperature + (humidity / 100) * (temperature - 14.4) + 46.4) %>%
  filter(!is.na(THI)) %>%
  data.table() %>%
  glimpse()

Rows: 274,160
Columns: 6
$ Date        <IDate> 2018-12-31, 2018-12-31, 2018-12-31, 2018-12-31, 2018-12-…
$ Country     <chr> "AT", "AT", "AT", "AT", "AT", "BE", "BE", "BE", "BE", "BE"…
$ City        <chr> "Graz", "Innsbruck", "Linz", "Salzburg", "Vienna", "Antwer…
$ humidity    <dbl> 70.2, 93.0, 92.0, 96.5, 70.0, 92.6, 93.0, 93.0, 92.5, 100.…
$ temperature <dbl> 3.7, 0.0, 3.3, 1.0, 4.5, 9.4, 9.4, 7.7, 8.8, 8.0, 8.8, 3.0…
$ THI         <dbl> 41.8486, 33.0080, 38.8280, 34.2690, 43.0700, 49.2900, 49.2…


# A simple function to turn an xts time series into a ggplot calendar heatmap
# https://margintale.blogspot.com/2018/01/ggplot2-time-series-heatmaps-revisited.html
#
# Arguments:
#   x - an xts time series; its index gives the dates and its first column
#       the values that colour the tiles
# Returns:
#   a ggplot object with one tile per day, faceted by year (rows) and
#   month (columns)
xts_heatmap <- function(x) {
  data.frame(Date=as.Date(zoo::index(x)), x[,1]) %>%
    setNames(c("Date","Value")) %>%
    dplyr::mutate(
      Year=lubridate::year(Date),
      Month=lubridate::month(Date),
      # I use factors here to get plot ordering in the right order without worrying about locale
      MonthTag=factor(Month,levels=as.character(1:12),
                      labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE),
      # week start on Monday in my world
      Wday=lubridate::wday(Date,week_start=1),
      # the rev reverse here is just for the plotting order
      WdayTag=factor(Wday,levels=rev(1:7),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE),
      Week=as.numeric(format(Date,"%W"))
    ) %>%
    # group by year and month and then calculate the week of the month we are currently in
    dplyr::group_by(Year,Month) %>%
    dplyr::mutate(Wmonth=1+Week-min(Week)) %>%
    dplyr::ungroup() %>%
    # every helper is namespace-qualified so the function does not depend on
    # which packages happen to be attached
    ggplot2::ggplot(ggplot2::aes(x=Wmonth, y=WdayTag, fill=Value)) +
    ggplot2::geom_tile(colour="white") +
    ggplot2::facet_grid(Year~MonthTag) +
    ggplot2::scale_fill_gradientn(colours=pals::coolwarm(100), guide="colourbar") +
    ggplot2::labs(x="Week of Month", y=NULL)
}


# Descriptive statistics for yearly THI
# Take the THI series of one city from 2019 onwards, then build a table
# with one summary() row per calendar year
myCity <- 'Kaunas'
myTable <- dataTableTHI %>%
  filter(City == myCity, Date >= '2019-01-01') %>%
  select(Date, THI)
thiByYear <- split(myTable$THI, year(myTable$Date))
do.call(rbind, lapply(thiByYear, summary))


# Plot THI time series calendar heatmap
# Wrap the THI column in an xts series ordered by date and draw it with
# xts_heatmap(), adding a title, square tiles and a compact minimal theme
myTitle <- paste("THI @", myCity)
xts_dataset <- xts(x = myTable[, 'THI'], order.by = as.Date(myTable$Date))
xts_heatmap(xts_dataset) +
  labs(title = myTitle) +
  coord_fixed(1) +
  theme_minimal(base_size = 7)


# Plot THI time series dynamics
# Draws xts_dataset through ggplot2's autoplot generic.
# NOTE(review): main/ylab/xlab are passed as extra arguments; confirm that the
# autoplot method dispatched for an xts object actually uses them.
ggplot2::autoplot(xts_dataset, main=myTitle, ylab="THI", xlab="Time")


#system("jupyter nbconvert --to html ASSIST_Air_Data_Rscript.ipynb")

	Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
2019	13.640	38.9960	49.53540	49.24271	59.72262	75.0560
2020	28.020	40.6560	50.18475	50.38856	59.67312	71.4790
2021	2.696	35.8275	47.17400	46.69031	58.80125	74.7400
2022	21.632	39.6800	50.30800	50.23746	60.31200	73.6255
2023	17.000	34.8800	44.34750	44.72900	53.77200	71.9085
2024	3.476	21.2165	25.91000	25.60470	33.15200	37.2840

Air Quality Data Processing with R

1. Prepare R environment

2. Download data files

3. Load and prepare data

4. Check and select vars

5. Pivot and calculate THI

6. THI statistics and plots

7. Export results to .html

temperature	humidity	pm10	no2	o3	pm25	so2	co
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
9.893751	9.885307	9.635065	9.41784	8.339904	7.234833	6.704074	4.524098

Country	FR	ES	DE	RO	PL	IT	NL	HU	BE	AT	CZ	FI	SE	..	PT	SK	DK	EE	IE	LT
Cities	27	23	17	17	14	12	11	7	6	5	5	5	4	..	2	2	1	1	1	1