Load Data

Show Code

# Read every raw input from DATA_PATH. The `_orig` suffix marks the untouched
# versions that the cleaning step later consumes.

# dates table (columns: number, date, day)
dates_orig <- DATA_PATH |> load_dates_data()

# mote location table (columns: ID, Height, Direc, Dist, Tree)
motes_orig <- DATA_PATH |> load_mote_location_data()

# redwood sensor readings, one object per `source` option
redwood_all_orig <- DATA_PATH |> load_redwood_data(source = "all")
redwood_net_orig <- DATA_PATH |> load_redwood_data(source = "net")
redwood_log_orig <- DATA_PATH |> load_redwood_data(source = "log")

Quick Look

Show Code

# Print the raw dates table to eyeball its three columns (number, date, day).
dates_orig

# A tibble: 13,000 × 3
   number date                        day
    <int> <chr>                     <dbl>
 1      1 Tue Apr 27 17:10:00 2004 12536.
 2      2 Tue Apr 27 17:15:00 2004 12536.
 3      3 Tue Apr 27 17:20:00 2004 12536.
 4      4 Tue Apr 27 17:25:00 2004 12536.
 5      5 Tue Apr 27 17:30:00 2004 12536.
 6      6 Tue Apr 27 17:35:00 2004 12536.
 7      7 Tue Apr 27 17:40:00 2004 12536.
 8      8 Tue Apr 27 17:45:00 2004 12536.
 9      9 Tue Apr 27 17:50:00 2004 12536.
10     10 Tue Apr 27 17:55:00 2004 12536.
# ℹ 12,990 more rows

Show Code

# Summarise column types, missingness and distributions of the raw dates table.
skimr::skim(dates_orig)

Data summary
Name	dates_orig
Number of rows	13000
Number of columns	3
_______________________
Column type frequency:
character	1
numeric	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
date	0	1	24	24	0	13000	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
number	0	1	6500.50	3752.92	1.00	3250.75	6500.50	9750.25	13000.00	▇▇▇▇▇
day	0	1	12558.57	13.03	12536.01	12547.29	12558.57	12569.86	12581.14	▇▇▇▇▇

Show Code

# Print the raw mote location table (ID, Height, Direc, Dist, Tree).
motes_orig

# A tibble: 80 × 5
      ID Height Direc  Dist Tree 
   <int>  <dbl> <chr> <dbl> <chr>
 1    24   10.5 WSW     0.1 edge 
 2    20   12.7 WSW     0.1 edge 
 3    27   14.9 WSW     0.1 edge 
 4    38   16.6 WSW     0.1 edge 
 5     0   16.9 NW      0.1 edge 
 6    67   19.7 SW      0.1 edge 
 7    55   21.5 SW      0.1 edge 
 8   114   23.3 SW      0.1 edge 
 9   198   24.8 NW      0.1 edge 
10   111   26   WSW     0.1 edge 
# ℹ 70 more rows

Show Code

# Summarise column types, missingness and distributions of the raw mote
# location table.
skimr::skim(motes_orig)

Data summary
Name	motes_orig
Number of rows	80
Number of columns	5
_______________________
Column type frequency:
character	2
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Direc	0	1	1	3	0	9	0
Tree	0	1	4	8	0	2	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ID	1	84.92	53.31	0.0	40.75	79.0	124.0	200.0	▇▇▆▆▂
Height	1	42.32	13.79	10.5	33.27	44.9	52.1	66.5	▂▃▆▇▃
Dist	1	0.82	1.34	0.1	0.10	0.1	1.0	5.0	▇▁▂▁▁

Show Code

# Print the raw sensor readings loaded with source = "all".
redwood_all_orig

# A tibble: 416,036 × 11
   result_time         epoch nodeid parent voltage depth humidity humid_temp
   <dttm>              <int>  <int>  <int>   <dbl> <int>    <dbl>      <dbl>
 1 2004-05-07 18:24:58  2812    119      5     220     2     95.4       12.7
 2 2004-05-07 18:24:58  2812    105    129     223     3     97.0       12.6
 3 2004-05-07 18:24:59  2812    113    118     222     4     94.5       12.5
 4 2004-05-07 18:24:59  2812    138      5     223     2     96.9       12.7
 5 2004-05-07 18:24:59  2812    127     42     222     3     97.8       12.4
 6 2004-05-07 18:29:58  2813     74      5     220     2     96.1       12.9
 7 2004-05-07 18:29:58  2813    197    110     219     3     97.4       12.2
 8 2004-05-07 18:29:59  2813     77      3     222     3     96.3       12.4
 9 2004-05-07 18:29:59  2813    138      5     224     2     98.2       12.5
10 2004-05-07 18:29:59  2813    113    118     224     4     96.3       12.4
# ℹ 416,026 more rows
# ℹ 3 more variables: humid_adj <dbl>, hamatop <dbl>, hamabot <dbl>

Show Code

# Summarise the raw source = "all" readings. The extreme p0/p100 values in the
# output (e.g. humidity, humid_temp, hamatop) point at outliers to clean.
# NOTE(review): nodeid/parent maxima of 65535 and depth of 255 look like
# sentinel values -- confirm before treating them as real IDs/depths.
skimr::skim(redwood_all_orig)

Data summary
Name	redwood_all_orig
Number of rows	416036
Number of columns	11
_______________________
Column type frequency:
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
epoch	0	1.00	4219.64	2921.56	2.00	1625.00	3817.00	6589.00	12635.00	▇▅▅▃▁
nodeid	0	1.00	93.61	113.84	2.00	55.00	110.00	127.00	65535.00	▇▁▁▁▁
parent	0	1.00	1726.73	10196.59	0.00	42.00	118.00	140.00	65535.00	▇▁▁▁▁
voltage	0	1.00	82.73	176.41	0.01	2.65	2.75	214.00	1023.00	▇▃▁▁▁
depth	0	1.00	29.50	64.76	0.00	2.00	3.00	7.00	255.00	▇▁▁▁▁
humidity	12532	0.97	64.35	29.12	-9375.37	44.48	64.80	83.64	114.89	▁▁▁▁▇
humid_temp	12532	0.97	14.82	7.08	-38.40	10.56	14.10	17.94	603.84	▇▁▁▁▁
humid_adj	12532	0.97	62.36	25.90	-6334.83	43.88	63.02	80.43	147.69	▁▁▁▁▇
hamatop	12532	0.97	11049.06	43271.28	0.00	0.00	0.00	7250.00	22592200.00	▇▁▁▁▁
hamabot	12532	0.97	252.78	1090.41	0.00	0.00	0.00	0.00	465820.00	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
result_time	0	1	2004-05-07 18:24:58	2004-11-10 14:25:00	2004-11-10 14:25:00	114981

Show Code

# Print the raw sensor readings loaded with source = "net".
redwood_net_orig

# A tibble: 114,980 × 11
   result_time         epoch nodeid parent voltage depth humidity humid_temp
   <dttm>              <int>  <int>  <int>   <int> <int>    <dbl>      <dbl>
 1 2004-05-07 18:24:58  2812    119      5     220     2     95.4       12.7
 2 2004-05-07 18:24:58  2812    105    129     223     3     97.0       12.6
 3 2004-05-07 18:24:59  2812    113    118     222     4     94.5       12.5
 4 2004-05-07 18:24:59  2812    138      5     223     2     96.9       12.7
 5 2004-05-07 18:24:59  2812    127     42     222     3     97.8       12.4
 6 2004-05-07 18:29:58  2813     74      5     220     2     96.1       12.9
 7 2004-05-07 18:29:58  2813    197    110     219     3     97.4       12.2
 8 2004-05-07 18:29:59  2813     77      3     222     3     96.3       12.4
 9 2004-05-07 18:29:59  2813    138      5     224     2     98.2       12.5
10 2004-05-07 18:29:59  2813    113    118     224     4     96.3       12.4
# ℹ 114,970 more rows
# ℹ 3 more variables: humid_adj <dbl>, hamatop <dbl>, hamabot <dbl>

Show Code

# Summarise the raw source = "net" readings.
# NOTE(review): voltage here ranges roughly 198-1023, whereas the "log" data
# ranges roughly 0-3 -- the two sources appear to use different voltage scales.
skimr::skim(redwood_net_orig)

Data summary
Name	redwood_net_orig
Number of rows	114980
Number of columns	11
_______________________
Column type frequency:
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
epoch	0	1.00	6567.79	2120.35	2812.00	4751.75	6567.00	8374.00	10288.00	▇▇▇▇▇
nodeid	0	1.00	95.02	51.56	3.00	59.00	110.00	127.00	198.00	▆▇▇▇▂
parent	0	1.00	126.19	1352.45	0.00	5.00	118.00	129.00	65535.00	▇▁▁▁▁
voltage	0	1.00	292.79	227.22	198.00	218.00	223.00	227.00	1023.00	▇▁▁▁▁
depth	0	1.00	2.46	5.31	1.00	2.00	2.00	3.00	255.00	▇▁▁▁▁
humidity	4262	0.96	72.12	21.33	-4.00	57.10	72.05	92.61	114.89	▁▂▇▇▆
humid_temp	4262	0.96	14.28	9.84	6.58	10.12	12.98	16.09	122.15	▇▁▁▁▁
humid_adj	4262	0.96	69.79	20.08	-3.03	55.79	69.77	89.05	147.69	▁▅▇▅▁
hamatop	4262	0.96	11521.65	24962.82	0.00	0.00	0.00	8436.36	113376.00	▇▁▁▁▁
hamabot	4262	0.96	271.95	805.31	0.00	0.00	0.00	0.00	9480.77	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
result_time	0	1	2004-05-07 18:24:58	2004-06-02 17:27:48	2004-05-20 19:21:21	114980

Show Code

# Print the raw sensor readings loaded with source = "log".
redwood_log_orig

# A tibble: 301,056 × 11
   result_time         epoch nodeid parent voltage depth humidity humid_temp
   <dttm>              <int>  <int>  <int>   <dbl> <int>    <dbl>      <dbl>
 1 2004-11-10 14:25:00     3      2  65535    2.88   255     43.8       22.4
 2 2004-11-10 14:25:00     4      2  65535    2.88   255     44.8       22.2
 3 2004-11-10 14:25:00     5      2  65535    2.88   255     45.8       22.1
 4 2004-11-10 14:25:00     6      2  65535    2.88   255     46.4       22.0
 5 2004-11-10 14:25:00     7      2  65535    2.88   255     46.0       22.0
 6 2004-11-10 14:25:00     8      2  65535    2.88   255     46.3       22.0
 7 2004-11-10 14:25:00     9      2  65535    2.88   255     46.8       22.0
 8 2004-11-10 14:25:00    10      2     60    2.88     8     47.3       22.1
 9 2004-11-10 14:25:00    11      2     15    2.88     4     48.1       22.2
10 2004-11-10 14:25:00    12      2     15    2.88     4     49.0       22.3
# ℹ 301,046 more rows
# ℹ 3 more variables: humid_adj <dbl>, hamatop <dbl>, hamabot <dbl>

Show Code

# Summarise the raw source = "log" readings. Note in the output that
# result_time has a single unique value, so it cannot serve as the
# measurement time for these rows.
skimr::skim(redwood_log_orig)

Data summary
Name	redwood_log_orig
Number of rows	301056
Number of columns	11
_______________________
Column type frequency:
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
epoch	0	1.00	3322.83	2677.35	2.00	1166.00	2420.00	5249.00	12635.00	▇▃▃▁▁
nodeid	0	1.00	93.08	129.97	2.00	49.00	105.00	127.00	65535.00	▇▁▁▁▁
parent	0	1.00	2338.01	11900.79	0.00	46.00	118.00	141.00	65535.00	▇▁▁▁▁
voltage	0	1.00	2.51	0.65	0.01	2.63	2.70	2.78	3.03	▁▁▁▁▇
depth	0	1.00	39.83	73.47	0.00	2.00	3.00	19.00	255.00	▇▁▁▁▁
humidity	8270	0.97	61.41	31.07	-9375.37	40.03	61.58	80.20	104.40	▁▁▁▁▇
humid_temp	8270	0.97	15.02	5.69	-38.40	10.86	14.72	18.81	603.84	▇▁▁▁▁
humid_adj	8270	0.97	59.55	27.27	-6334.83	39.53	60.00	77.22	100.22	▁▁▁▁▇
hamatop	8270	0.97	10870.34	48422.14	0.00	0.00	0.00	6762.33	22592200.00	▇▁▁▁▁
hamabot	8270	0.97	245.54	1180.34	0.00	0.00	0.00	0.00	465820.00	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
result_time	0	1	2004-11-10 14:25:00	2004-11-10 14:25:00	2004-11-10 14:25:00	1

Intermediary Explorations

Fill in any intermediary explorations or scratch work you’d like to do here (e.g., explorations done to identify issues with the data)

Show Code

# Exploratory overview of the raw readings: one scatter panel per sensor
# variable against epoch, each with its own y-axis because the variables are
# on very different scales.
plt <- redwood_all_orig |> 
  tidyr::pivot_longer(
    cols = c(humidity, humid_temp, hamatop, hamabot, voltage),
    # The raw sensor columns contain missing readings; drop them while
    # reshaping so geom_point() does not warn about removed rows.
    values_drop_na = TRUE
  ) |> 
  ggplot2::ggplot() +
  # `name` and `value` are the default output columns of pivot_longer()
  ggplot2::aes(x = epoch, y = value) +
  ggplot2::facet_grid(name ~ ., scales = "free_y") +
  ggplot2::geom_point()
plt

Clean Data

Show Code

# TODO: fill out cleaning functions in clean.R

# Run each raw table through its cleaning function; the `_df` suffix marks
# the cleaned versions.

# dates
dates_df <- dates_orig |> clean_dates_data()

# mote locations
motes_df <- motes_orig |> clean_mote_location_data()

# redwood sensor readings (same cleaner for all three sources)
redwood_all_df <- redwood_all_orig |> clean_redwood_data()
redwood_net_df <- redwood_net_orig |> clean_redwood_data()
redwood_log_df <- redwood_log_orig |> clean_redwood_data()

Intermediary Explorations

Fill in any intermediary explorations or scratch work you’d like to do here (e.g., explorations to figure out how you can merge the data)

Merge data

Show Code

# Combine the cleaned dates, mote locations, and the net/log sensor readings
# into one analysis table.
redwood_df <- merge_redwood_data(
  dates_data = dates_df,
  motes_data = motes_df,
  redwood_net_data = redwood_net_df,
  redwood_log_data = redwood_log_df
)

# Render only the first 1000 rows as an interactive table.
vthemes::pretty_DT(dplyr::slice_head(redwood_df, n = 1000))

Show Code

# Summarise the merged table. In the output, the mote location columns
# (Direc, Tree, Height, Dist) are missing for 6085 rows, and the extreme
# humidity/temp/iPAR/rPAR values are still present.
skimr::skim(redwood_df)

Data summary
Name	redwood_df
Number of rows	310258
Number of columns	22
_______________________
Column type frequency:
character	5
Date	1
factor	1
numeric	12
POSIXct	2
Timespan	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
source	0	1.00	3	3	2
time_chr	0	1.00	8	8	288
date_chr	0	1.00	15	15	45
Direc	6085	0.98	1	3	9
Tree	6085	0.98	4	8	2

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
date	0	1	2004-04-27	2004-06-10	2004-05-07	45

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
day_of_week	0	1	FALSE	7	Wed: 52681, Thu: 46702, Tue: 44570, Fri: 42824

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
epoch	0	1.00	3712.24	2999.23	2.00	1236.00	2685.00	5969.75	12635.00	▇▃▃▂▁
nodeid	0	1.00	91.83	128.43	2.00	46.00	105.00	127.00	65535.00	▇▁▁▁▁
parent	0	1.00	2194.41	11527.89	0.00	44.00	118.00	140.00	65535.00	▇▁▁▁▁
voltage	0	1.00	27.96	111.66	0.01	2.64	2.71	2.80	1023.00	▇▁▁▁▁
depth	0	1.00	36.93	71.19	0.00	2.00	3.00	15.00	255.00	▇▁▁▁▁
humidity	0	1.00	61.44	30.82	-9375.37	40.13	61.25	80.57	114.89	▁▁▁▁▇
temp	0	1.00	15.37	7.55	-38.40	10.97	14.85	19.08	603.84	▇▁▁▁▁
iPAR	0	1.00	10709.38	47276.00	0.00	0.00	0.00	6614.04	22592200.00	▇▁▁▁▁
rPAR	0	1.00	231.61	1143.68	0.00	0.00	0.00	0.00	465820.00	▇▁▁▁▁
day	0	1.00	12548.89	10.41	12536.01	12540.30	12545.33	12556.73	12579.88	▇▃▃▂▁
Height	6085	0.98	47.74	12.04	10.50	40.30	49.60	56.10	66.50	▁▃▃▇▆
Dist	6085	0.98	0.99	1.47	0.10	0.10	0.10	1.00	5.00	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
result_time	0	1	2004-05-07 18:24:59	2004-11-10 14:25:00	2004-11-10 14:25:00	25646
datetime	0	1	2004-04-27 17:15:00	2004-06-10 14:00:00	2004-05-07 00:50:00	12634

Variable type: Timespan

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time	0	1	0	0	0	1

Intermediary Explorations

Taking a quick look at the various variables over time…

Show Code

# Overview of the merged data: reshape the five measurements to long format
# (pivot_longer()'s default `name`/`value` columns spelled out) and draw one
# scatter panel per variable against epoch, each with a free y-axis.
plt <- redwood_df |>
  tidyr::pivot_longer(
    cols = c("humidity", "temp", "iPAR", "rPAR", "voltage"),
    names_to = "name",
    values_to = "value"
  ) |>
  ggplot2::ggplot(ggplot2::aes(x = epoch, y = value)) +
  ggplot2::geom_point() +
  ggplot2::facet_grid(rows = ggplot2::vars(name), scales = "free_y")
plt

What if we look more closely at temperature and color points by source data?

Show Code

# Temperature against epoch, coloured by which dataset (`source`) each
# reading came from.
plt <- ggplot2::ggplot(
  redwood_df,
  ggplot2::aes(x = epoch, y = temp, color = source)
) +
  ggplot2::geom_point()
plt

Looks like most of the outliers are in the network dataset. We can even toggle between the log/net data and zoom in to see what’s going on more closely using plotly. (Try double-clicking on the red/blue points in the legend to toggle between the different data sources.)

Show Code

# Convert the static ggplot held in `plt` into an interactive plotly widget
# so the points can be zoomed and the legend entries toggled.
plotly::ggplotly(plt)

From this, it looks like the temperature is trailing off to some outrageously large temperature value. What might be going on here? Maybe some network failure? So what if we subset the data to just the network data and color the network temperature values by voltage?

Show Code

# Network readings only: temperature against epoch, coloured by voltage, to
# check whether the odd temperatures coincide with unusual voltage values.
plt <- ggplot2::ggplot(
  data = dplyr::filter(redwood_df, source == "net"),
  mapping = ggplot2::aes(x = epoch, y = temp, color = voltage)
) +
  ggplot2::geom_point()
plt

This might suggest that there is some network failure happening that is ultimately leading to erroneous measurements. Temperature doesn’t typically fluctuate by that much in a short period of time (recall each epoch is 5 min). Maybe we can use the voltage values in some way to help us do our data cleaning…

Note however that there are still some outliers that don’t have a voltage value. We will have to deal with those (and probably other data issues) independently.

Finally, what if we repeated some of these plots for all of the measured variables of interest?

Show Code

# All four measured variables against epoch, one panel per variable with a
# free y-axis, coloured by source dataset.
plt <- redwood_df |>
  tidyr::pivot_longer(
    cols = c("humidity", "temp", "iPAR", "rPAR"),
    names_to = "name",
    values_to = "value"
  ) |>
  ggplot2::ggplot(ggplot2::aes(x = epoch, y = value, color = source)) +
  ggplot2::geom_point() +
  ggplot2::facet_grid(rows = ggplot2::vars(name), scales = "free_y")
plt

Restricted to just the network data and plotting by voltage…

Show Code

# Network readings only: each measured variable against epoch in its own
# free-scaled panel, coloured by voltage.
plt <- dplyr::filter(redwood_df, source == "net") |>
  tidyr::pivot_longer(
    cols = c("humidity", "temp", "iPAR", "rPAR"),
    names_to = "name",
    values_to = "value"
  ) |>
  ggplot2::ggplot(ggplot2::aes(x = epoch, y = value, color = voltage)) +
  ggplot2::geom_point() +
  ggplot2::facet_grid(rows = ggplot2::vars(name), scales = "free_y")
plt

Again, note that these aren’t the only outliers. There are also outliers in the log data, as we’ve seen above. Further investigation is needed to clean the log data and also to finish cleaning up the network data.

Show Code

# Same per-variable panels coloured by source, but first drop the rows with
# extreme negative humidity (the summary shows a minimum near -9375) so they
# do not stretch the humidity axis. Tiny points reduce overplotting.
plt <- dplyr::filter(redwood_df, humidity > -1000) |>
  tidyr::pivot_longer(
    cols = c("humidity", "temp", "iPAR", "rPAR"),
    names_to = "name",
    values_to = "value"
  ) |>
  ggplot2::ggplot(ggplot2::aes(x = epoch, y = value, color = source)) +
  ggplot2::geom_point(size = 0.1) +
  ggplot2::facet_grid(rows = ggplot2::vars(name), scales = "free_y")
plt