Background

When analysing Ta exposure for the SEPAGES cohort, Emie and Arianne noticed that some exposures are missing. The problem seems to be caused by the fact that the 1 km grid cell coordinates vary between years.

Exploration

knitr::opts_knit$set(root.dir = here::here())
knitr::opts_knit$set(echo = FALSE)
library(magrittr)
library(glue)
library(fst)
library(data.table)

1. Check whether grid coordinates change between 2015 and 2017

We start with a quick check on one day of data for 2015 and 2017.

# Set the path to the directory that contains the final 1 km Ta model predictions
expo_dir <- "/media/summer_modeles_expo/temperature_hough_1km"
# Load first day of final 1 km Ta predictions for 2015 and 2017
ncells <- glue("{expo_dir}/mod3_2015_pred_1km.fst") %>% 
          fst() %>%
          nrow() %>%
          divide_by(365)
jan1_2015 <- glue("{expo_dir}/mod3_2015_pred_1km.fst") %>% 
             read_fst(to = ncells, as.data.table = TRUE)
jan1_2017 <- glue("{expo_dir}/mod3_2017_pred_1km.fst") %>% 
             read_fst(to = ncells, as.data.table = TRUE)
pryr::mem_used()
209 MB
# Confirm coordinates are same between years
print("Checking coordinates for Jan 1, 2015 / 2017")
[1] "Checking coordinates for Jan 1, 2015 / 2017"
identical(jan1_2015$grd_1km_id, jan1_2017$grd_1km_id) %>% 
  paste("Jan 1 2015 vs 2017: id identical?", .)
[1] "Jan 1 2015 vs 2017: id identical? TRUE"
identical(jan1_2015[, .(grd_1km_x, grd_1km_y)], jan1_2017[, .(grd_1km_x, grd_1km_y)]) %>% 
  paste("Jan 1 2015 vs 2017: x and y identical?", .)
[1] "Jan 1 2015 vs 2017: x and y identical? FALSE"

The IDs are identical, but the coordinates differ from 2015 to 2017. However, the coordinates are floating point numbers and so should not be assumed to be exactly equal. Comparing with all.equal shows that they differ by no more than 1 nanometer:

all.equal(
  jan1_2015[, .(grd_1km_x, grd_1km_y)],
  jan1_2017[, .(grd_1km_x, grd_1km_y)],
  tolerance = 1e-9
) %>% 
  paste("Jan 1 2015 vs 2017: x and y equal to within 1 nanometer?", .)
[1] "Jan 1 2015 vs 2017: x and y equal to within 1 nanometer? TRUE"
print("Jan 1 2015 vs 2017 grd_1km_x")
[1] "Jan 1 2015 vs 2017 grd_1km_x"
diff_idx <- which(jan1_2015$grd_1km_x != jan1_2017$grd_1km_x)[1:3]
c("2015:", jan1_2015[diff_idx, format(grd_1km_x, nsmall = 10)])
[1] "2015:"             "661279.2366867308" "658520.7071575987" "659451.2811964190"
c("2017:", jan1_2017[diff_idx, format(grd_1km_x, nsmall = 10)])
[1] "2017:"             "661279.2366867306" "658520.7071575986" "659451.2811964189"

2. Check coordinates for entire years

Now we confirm there are no substantial differences between the coordinates for the entire years.

# Load entire years
# cols <- c("date", "grd_1km_id", "grd_1km_x", "grd_1km_y", "tmin")
# ta_2015 <- read_fst(glue("{expo_dir}/mod3_2015_pred_1km.fst"), as.data.table = TRUE, columns = cols)
# ta_2017 <- read_fst(glue("{expo_dir}/mod3_2017_pred_1km.fst"), as.data.table = TRUE, columns = cols)
pryr::mem_used()
18.5 GB
# Confirm coordinates are same in 2015 and 2017
identical(ta_2015$grd_1km_id, ta_2017$grd_1km_id) %>% 
  paste("2015 vs 2017: id identical?", .)
[1] "2015 vs 2017: id identical? TRUE"
identical(ta_2015[, .(grd_1km_x, grd_1km_y)], ta_2017[, .(grd_1km_x, grd_1km_y)]) %>% 
  paste("2015 vs 2017: x and y identical?", .)
[1] "2015 vs 2017: x and y identical? FALSE"
all.equal(ta_2015[, .(grd_1km_x, grd_1km_y)], ta_2017[, .(grd_1km_x, grd_1km_y)]) %>% 
  paste("2015 vs 2017: x and y equal to within 1 nanometer?", .)

Again, the IDs are identical in all years and the coordinates differ by less than 1 nanometer between 2015 and 2017. Within a year the coordinates are identical.

setNumericRounding(0) # Confirm data.table does not round when checking uniqueness
ta_2015[, .(locs = uniqueN(.SD)), .SDcols = c("grd_1km_x", "grd_1km_y"), by = .(grd_1km_id)] %>%
  .[locs > 1, .N] %>%
  paste("2015 grid cells with varying coords:", .)
[1] "2015 grid cells with varying coords: 0"
ta_2017[, .(locs = uniqueN(.SD)), .SDcols = c("grd_1km_x", "grd_1km_y"), by = .(grd_1km_id)] %>%
  .[locs > 1, .N] %>%
  paste("2017 grid cells with varying coords:", .)
[1] "2017 grid cells with varying coords: 0"

3. Calculate exposure for SEPAGES

Now we check to see if the miniscule difference in coordinates is the source of the problems with the calculated exposures. First we load the matched grid coordinates for each participant address and confirm that the column mother_token_nbmoove is a unique row identifier.

# Load SEPAGES subject addresses matched to Ta model grid cell coordinates
db_address_expo <- readRDS("~/db_address_expo.rds") %>%
                   setkeyv(c("mother_token", "nbmoove", "start_date", "end_date"))
db_address_expo[, .N, by = .(mother_token_nbmoove)] %>%
  .[N > 1, .N] %>%
  paste("Non-unique mother_token_nbmoove:", .) %>%
  print()
[1] "Non-unique mother_token_nbmoove: 0"

Now we calculate exposure for 2015 and 2017 by matching the coordinates of each subject-location to the exposure model. This is dangerous because coordinates are floating-point values and so may differ very slightly if any calculation was performed on them (and maybe between machines???).

# Get Ta exposure for 2015 and 2017
setkeyv(ta_2015, c("date", "grd_1km_id", "grd_1km_x", "grd_1km_y"))
setkeyv(ta_2017, c("date", "grd_1km_id", "grd_1km_x", "grd_1km_y"))
day_xy <- db_address_expo[, .(grd_1km_x, grd_1km_y, date = seq.Date(start_date, end_date, by = 1)),
                          keyby = .(mother_token_nbmoove)]
expo_2015 <- day_xy[year(date) == 2015, ] %>%
             ta_2015[., on = .(date, grd_1km_x, grd_1km_y)] %>%
             setkeyv(c("mother_token_nbmoove", "date")) %>%
             setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id"))
expo_2017 <- day_xy[year(date) == 2017, ] %>%
             ta_2017[., on = .(date, grd_1km_x, grd_1km_y)] %>%
             setkeyv(c("mother_token_nbmoove", "date")) %>%
             setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id"))
paste("Subject location-days with no 2015 exposure:", expo_2015[is.na(grd_1km_id), .N])
[1] "Subject location-days with no 2015 exposure: 106"
paste("Subject location-days with no 2017 exposure:", expo_2017[is.na(grd_1km_id), .N])
[1] "Subject location-days with no 2017 exposure: 86724"

If we look at one subject-location for a few days in 2015 and 2017 we see that we have exposure for 2015 but not 2015, despite no apparent difference in the coordinates.

smpl <- rbind(
          expo_2015[mother_token_nbmoove == "AB054987_1", ][1:3, ],
          expo_2017[mother_token_nbmoove == "AB054987_1", ][1:3, ]
        )
smpl

However, if we look at the full coordinates we see that in 2015 they match but in 2017 they differ very slightly from the coordinates in the Ta model:

print("Full subject-location coordinates:")
[1] "Full subject-location coordinates:"
xy <- day_xy[mother_token_nbmoove == "AB054987_1", .(grd_1km_x, grd_1km_y)] %>%
      unique()
format(xy, nsmall = 10)
     grd_1km_x           grd_1km_y           
[1,] "718921.0712282599" "6494067.2242461294"
print("Corresponding 2015 Ta model coordinates:")
[1] "Corresponding 2015 Ta model coordinates:"
ta_2015[between(grd_1km_x, 718921, 718922) & between(grd_1km_y, 6494067, 6494068),
        .(grd_1km_x, grd_1km_y)] %>%
  unique() %>%
  format(nsmall = 10)
     grd_1km_x           grd_1km_y           
[1,] "718921.0712282599" "6494067.2242461294"
print("Corresponding 2017 Ta model coordinates:")
[1] "Corresponding 2017 Ta model coordinates:"
ta_2017[between(grd_1km_x, 718921, 718922) & between(grd_1km_y, 6494067, 6494068),
        .(grd_1km_x, grd_1km_y)] %>%
  unique() %>%
  format(nsmall = 10)
     grd_1km_x           grd_1km_y           
[1,] "718921.0712282599" "6494067.2242461275"

The robust solution to this problem is to join by the Ta model grid id, grd_1km_id, which is a string. In general, joining by floats is dangerous. In this case an alternative quick fix is to set data.table to round the least-significant 2 bytes off of floats when comparing:

setNumericRounding(2)
smpl[year(date) == 2017, .(mother_token_nbmoove, date, grd_1km_x, grd_1km_y)] %>% 
  ta_2017[., on = .(date, grd_1km_x, grd_1km_y)] %>% 
  setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id")) %>% 
  print()

We now match the Ta exposure data. If we rematch for all subject-location-days we see that the number of missing exposures is unchanged for 2015, and greatly reduced in 2017:

expo_2015 <- day_xy[year(date) == 2015, ] %>%
             ta_2015[., on = .(date, grd_1km_x, grd_1km_y)] %>%
             setkeyv(c("mother_token_nbmoove", "date")) %>%
             setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id"))
expo_2017 <- day_xy[year(date) == 2017, ] %>%
             ta_2017[., on = .(date, grd_1km_x, grd_1km_y)] %>%
             setkeyv(c("mother_token_nbmoove", "date")) %>%
             setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id"))
paste("Subject location-days with no 2015 exposure:", expo_2015[is.na(grd_1km_id), .N])
[1] "Subject location-days with no 2015 exposure: 106"
paste("Subject location-days with no 2017 exposure:", expo_2017[is.na(grd_1km_id), .N])
[1] "Subject location-days with no 2017 exposure: 141"
---
title: "2020-05-29 Explore differences in 1 km grid coordinates between years"
output:
  html_notebook: 
    code_folding: hide
---

## Background

When analysing Ta exposure for the SEPAGES cohort, Emie and Arianne noticed that some exposures are missing. The problem seems to be caused by the fact that the 1 km grid cell coordinates vary between years.

## Exploration

```{r setup}
knitr::opts_knit$set(root.dir = here::here())
knitr::opts_knit$set(echo = FALSE)

library(magrittr)
library(glue)
library(fst)
library(data.table)
```

### 1. Check whether grid coordinates change between 2015 and 2017

We start with a quick check on one day of data for 2015 and 2017.

```{r check-jan1}
# Set the path to the directory that contains the final 1 km Ta model predictions
expo_dir <- "/media/summer_modeles_expo/temperature_hough_1km"

# Load first day of final 1 km Ta predictions for 2015 and 2017
ncells <- glue("{expo_dir}/mod3_2015_pred_1km.fst") %>% 
          fst() %>%
          nrow() %>%
          divide_by(365)
jan1_2015 <- glue("{expo_dir}/mod3_2015_pred_1km.fst") %>% 
             read_fst(to = ncells, as.data.table = TRUE)
jan1_2017 <- glue("{expo_dir}/mod3_2017_pred_1km.fst") %>% 
             read_fst(to = ncells, as.data.table = TRUE)
pryr::mem_used()


# Confirm coordinates are same between years
print("Checking coordinates for Jan 1, 2015 / 2017")

identical(jan1_2015$grd_1km_id, jan1_2017$grd_1km_id) %>% 
  paste("Jan 1 2015 vs 2017: id identical?", .)

identical(jan1_2015[, .(grd_1km_x, grd_1km_y)], jan1_2017[, .(grd_1km_x, grd_1km_y)]) %>% 
  paste("Jan 1 2015 vs 2017: x and y identical?", .)
```

The IDs are identical, but the coordinates differ from 2015 to 2017. However, the coordinates are floating point numbers and so should not be assumed to be exactly equal. Comparing with `all.equal` shows that they differ by no more than 1 nanometer:

```{r jan1-details}
all.equal(
  jan1_2015[, .(grd_1km_x, grd_1km_y)],
  jan1_2017[, .(grd_1km_x, grd_1km_y)],
  tolerance = 1e-9
) %>% 
  paste("Jan 1 2015 vs 2017: x and y equal to within 1 nanometer?", .)


print("Jan 1 2015 vs 2017 grd_1km_x")
diff_idx <- which(jan1_2015$grd_1km_x != jan1_2017$grd_1km_x)[1:3]
c("2015:", jan1_2015[diff_idx, format(grd_1km_x, nsmall = 10)])
c("2017:", jan1_2017[diff_idx, format(grd_1km_x, nsmall = 10)])

```

### 2. Check coordinates for entire years

Now we confirm there are no substantial differences between the coordinates for the entire years.

```{r check-between-years}
# Load entire years
cols <- c("date", "grd_1km_id", "grd_1km_x", "grd_1km_y", "tmin")
ta_2015 <- read_fst(glue("{expo_dir}/mod3_2015_pred_1km.fst"), as.data.table = TRUE, columns = cols)
ta_2017 <- read_fst(glue("{expo_dir}/mod3_2017_pred_1km.fst"), as.data.table = TRUE, columns = cols)
pryr::mem_used()

# Confirm coordinates are same in 2015 and 2017
identical(ta_2015$grd_1km_id, ta_2017$grd_1km_id) %>% 
  paste("2015 vs 2017: id identical?", .)

identical(ta_2015[, .(grd_1km_x, grd_1km_y)], ta_2017[, .(grd_1km_x, grd_1km_y)]) %>% 
  paste("2015 vs 2017: x and y identical?", .)

all.equal(ta_2015[, .(grd_1km_x, grd_1km_y)], ta_2017[, .(grd_1km_x, grd_1km_y)]) %>% 
  paste("2015 vs 2017: x and y equal to within 1 nanometer?", .)

```

Again, the IDs are identical in all years and the coordinates differ by less than 1 nanometer between 2015 and 2017. Within a year the coordinates are identical.

```{r check-within-year}
setNumericRounding(0) # Confirm data.table does not round when checking uniqueness

ta_2015[, .(locs = uniqueN(.SD)), .SDcols = c("grd_1km_x", "grd_1km_y"), by = .(grd_1km_id)] %>%
  .[locs > 1, .N] %>%
  paste("2015 grid cells with varying coords:", .)

ta_2017[, .(locs = uniqueN(.SD)), .SDcols = c("grd_1km_x", "grd_1km_y"), by = .(grd_1km_id)] %>%
  .[locs > 1, .N] %>%
  paste("2017 grid cells with varying coords:", .)

```


### 3. Calculate exposure for SEPAGES

Now we check to see if the miniscule difference in coordinates is the source of the problems with the calculated exposures. First we load the matched grid coordinates for each participant address and confirm that the column `mother_token_nbmoove` is a unique row identifier.

```{r load-cohort}
# Load SEPAGES subject addresses matched to Ta model grid cell coordinates
db_address_expo <- readRDS("~/db_address_expo.rds") %>%
                   setkeyv(c("mother_token", "nbmoove", "start_date", "end_date"))
db_address_expo[, .N, by = .(mother_token_nbmoove)] %>%
  .[N > 1, .N] %>%
  paste("Non-unique mother_token_nbmoove:", .) %>%
  print()
```

Now we calculate exposure for 2015 and 2017 by matching the coordinates of each subject-location to the exposure model. This is dangerous because coordinates are floating-point values and so may differ very slightly if any calculation was performed on them (and maybe between machines???).

```{r calc-exposure}
# Get Ta exposure for 2015 and 2017
setkeyv(ta_2015, c("date", "grd_1km_id", "grd_1km_x", "grd_1km_y"))
setkeyv(ta_2017, c("date", "grd_1km_id", "grd_1km_x", "grd_1km_y"))
day_xy <- db_address_expo[, .(grd_1km_x, grd_1km_y, date = seq.Date(start_date, end_date, by = 1)),
                          keyby = .(mother_token_nbmoove)]
expo_2015 <- day_xy[year(date) == 2015, ] %>%
             ta_2015[., on = .(date, grd_1km_x, grd_1km_y)] %>%
             setkeyv(c("mother_token_nbmoove", "date")) %>%
             setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id"))
expo_2017 <- day_xy[year(date) == 2017, ] %>%
             ta_2017[., on = .(date, grd_1km_x, grd_1km_y)] %>%
             setkeyv(c("mother_token_nbmoove", "date")) %>%
             setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id"))
paste("Subject location-days with no 2015 exposure:", expo_2015[is.na(grd_1km_id), .N])
paste("Subject location-days with no 2017 exposure:", expo_2017[is.na(grd_1km_id), .N])
```

If we look at one subject-location for a few days in 2015 and 2017 we see that we have exposure for 2015 but not 2015, despite no apparent difference in the coordinates.

```{r check-exposure}
smpl <- rbind(
          expo_2015[mother_token_nbmoove == "AB054987_1", ][1:3, ],
          expo_2017[mother_token_nbmoove == "AB054987_1", ][1:3, ]
        )
smpl
```

However, if we look at the full coordinates we see that in 2015 they match but in 2017 they differ very slightly from the coordinates in the Ta model:

```{r example-subj-loc}
print("Full subject-location coordinates:")
xy <- day_xy[mother_token_nbmoove == "AB054987_1", .(grd_1km_x, grd_1km_y)] %>%
      unique()
format(xy, nsmall = 10)

print("Corresponding 2015 Ta model coordinates:")
ta_2015[between(grd_1km_x, 718921, 718922) & between(grd_1km_y, 6494067, 6494068),
        .(grd_1km_x, grd_1km_y)] %>%
  unique() %>%
  format(nsmall = 10)

print("Corresponding 2017 Ta model coordinates:")
ta_2017[between(grd_1km_x, 718921, 718922) & between(grd_1km_y, 6494067, 6494068),
        .(grd_1km_x, grd_1km_y)] %>%
  unique() %>%
  format(nsmall = 10)

```

The robust solution to this problem is to join by the Ta model grid id, `grd_1km_id`, which is a string. In general, joining by floats is dangerous. In this case an alternative quick fix is to set data.table to round the least-significant 2 bytes off of floats when comparing:

```{r use-rounding}
setNumericRounding(2)
smpl[year(date) == 2017, .(mother_token_nbmoove, date, grd_1km_x, grd_1km_y)] %>% 
  ta_2017[., on = .(date, grd_1km_x, grd_1km_y)] %>% 
  setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id")) %>% 
  print()

```

We now match the Ta exposure data. If we rematch for all subject-location-days we see that the number of missing exposures is unchanged for 2015, and greatly reduced in 2017:

```{r recalc-exposure}
expo_2015 <- day_xy[year(date) == 2015, ] %>%
             ta_2015[., on = .(date, grd_1km_x, grd_1km_y)] %>%
             setkeyv(c("mother_token_nbmoove", "date")) %>%
             setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id"))
expo_2017 <- day_xy[year(date) == 2017, ] %>%
             ta_2017[., on = .(date, grd_1km_x, grd_1km_y)] %>%
             setkeyv(c("mother_token_nbmoove", "date")) %>%
             setcolorder(c("mother_token_nbmoove", "date", "grd_1km_id"))
paste("Subject location-days with no 2015 exposure:", expo_2015[is.na(grd_1km_id), .N])
paste("Subject location-days with no 2017 exposure:", expo_2017[is.na(grd_1km_id), .N])

```
