Overview: Script for processing pH data from water stations. Sources and notes on the data are below. First I remove data points flagged as failed or suspect. Then I use a rolling mean to remove data points more than +/- 2 standard deviations from the 2 hr rolling mean. Finally, I calculate the hourly median, since data were collected at different frequencies. This sets me up to do analysis between sites and with field data. Sites are graphed from closest to the delta to the mouth of the bay.

Notes on data:

- Accessed and downloaded all data on September 30, 2020.
- Downloaded all data available from 1/1/2017-12/31/2019.
- China Camp: http://cdmo.baruch.sc.edu/aqs/zips.cfm. Data logged every 15 minutes. Requested citation format: NOAA National Estuarine Research Reserve System (NERRS). System-wide Monitoring Program. Data accessed from the NOAA NERRS Centralized Data Management Office website: http://cdmo.baruch.sc.edu/; accessed 30 September 2020.
- Richardson Bay: NERR data. Data logged every 15 minutes.
- EOS: https://oceanview.pfeg.noaa.gov/erddap/tabledap/rtcctdRTCysi.html. Data logged every 6 minutes. Time in UTC. Should I use this link instead? http://erddap.cencoos.org/erddap/tabledap/tiburon-water-tibc1.html. pH and salinity available. No air temperature.
- Fort Point: http://erddap.cencoos.org/erddap/tabledap/fort-point.html?time%2Csea_water_temperature%2Csea_water_temperature_qc_agg%2Cz&time%3E%3D2020-09-20T21%3A07%3A41Z&time%3C%3D2020-09-29T12%3A00%3A00Z. Time in UTC. Data logged every 6 minutes.

Next steps:

- Fix the aesthetics of the graphs: font too small, change gradient, clean up keys, etc.
- Analysis with field data
- Site characterization summary table

Set up

rm(list=ls())

library(tidyverse)
library(ggpubr)
library(scales)
library(chron)
library(plotly)
library(taRifx)
library(aweek)
library(easypackages)
library(renv)
library(here)
library(ggthemes)
library(gridExtra)
library(patchwork)
library(tidyquant)
library(recipes) 
library(cranlogs)
library(knitr)
library(openair)

Make a custom function to calculate the mean, standard deviation (sd), and an approximate 95% confidence interval, where x is a numeric vector. Here “hi” and “lo” are defined as 2 standard deviations from the mean. This step only creates the function; it does not manipulate the data. You can change the equation to define hi and lo as 3 (or however many) sds away. na.rm=TRUE removes any “NA” values. “ret” holds the new columns that are added when you run this on a df.

custom_stat_fun_2<-function(x, na.rm=TRUE){
  m<-mean(x, na.rm=na.rm)
  s<-sd(x, na.rm=na.rm)
  hi<-m+2*s
  lo<-m-2*s
  ret<-c(mean=m, stdev=s, hi.95=hi, lo.95=lo)
  return(ret)
}
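As a quick sanity check, the helper can be run on a short vector of made-up pH values (illustrative only) to see the four named values it returns:

```r
# Made-up pH values for illustration only
x <- c(7.8, 7.9, 8.0, 8.1, 8.2)
stats <- custom_stat_fun_2(x)
round(stats, 2)
##  mean stdev hi.95 lo.95
##  8.00  0.16  8.32  7.68
```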

China Camp
Read in the data, then subset (not strictly needed, but it’s a large df). A column named “date” with no time stamp is needed for a later step.

cc<-read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/china_camp_all.csv", header = TRUE, sep=",", fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)

cc$date<-as.POSIXct(cc$DateTimeStamp, format = c("%m/%d/%Y"))

cc$datetime<-as.POSIXct(cc$DateTimeStamp, format = c("%m/%d/%Y %H:%M"))

cc<-subset(cc, select=c(date, datetime, pH, F_pH))

cc%>%
  ggplot(aes(x=datetime, y=pH, color=pH))+
  geom_point(alpha=0.5)+
  labs(title="pH data from China Camp: All data points",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
## Warning: Removed 847 rows containing missing values (geom_point).

Remove data that failed QC test

cc<-cc[- grep("-3", cc$F_pH),]

cc%>%
  ggplot(aes(x=date, y=pH, color=pH))+
  geom_point(alpha=0.5)+
  labs(title="pH data from China Camp: Failed QC points removed",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
## Warning: Removed 835 rows containing missing values (geom_point).

Remove data points flagged as “suspect” too

cc<-cc[- grep("1", cc$F_pH),]
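One caveat with the negative-index `grep()` pattern used above: if no rows match (e.g. a year with no flagged points), `grep()` returns a zero-length vector and `cc[-integer(0), ]` drops every row. A sketch of a safer equivalent using `grepl()`, which returns a logical vector of the same length (the data frame and flag strings here are hypothetical):

```r
# grepl() returns TRUE/FALSE per row, so an empty match keeps all rows
df <- data.frame(pH = c(7.9, 8.1, 8.0), F_pH = c("<0>", "<-3>", "<1>"))
df_clean <- df[!grepl("-3|1", df$F_pH), ]  # drop failed (-3) and suspect (1)
df_clean
##    pH F_pH
## 1 7.9  <0>
```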

cc%>%
  ggplot(aes(x=date, y=pH, color=pH))+
  geom_point(alpha=0.5)+
  labs(title="pH data from China Camp: Failed & suspect QC points removed",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
## Warning: Removed 835 rows containing missing values (geom_point).

Roll apply using custom stat function. Needs a column named “date” with no time stamp in order to work. Since data was collected every 15min, 8 observations=2hour window.

2 hour

cc<-cc%>%
  tq_mutate(
    select= pH,
    mutate_fun= rollapply,
    width= 8,
    align= "right",
    by.column= FALSE,
    FUN= custom_stat_fun_2,
    na.rm= TRUE)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique
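The `width` argument is just the window length divided by the sampling interval. A small helper (hypothetical, not part of the original script) makes that explicit for the different stations:

```r
# Rows per rolling window = window minutes / sampling-interval minutes
window_width <- function(window_min, interval_min) window_min / interval_min
window_width(120, 15)  # China Camp & Richardson Bay: 8 obs per 2 hr
window_width(120, 6)   # EOS pier: 20 obs per 2 hr
```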

Filter to remove values that are more than +/- 2 standard deviations from the rolling mean

cc<-filter(cc, pH <hi.95 & pH >lo.95)

cc%>%
  ggplot(aes(x=datetime, y=pH, color=pH))+
  geom_point(alpha=0.5)+
  labs(title="pH data from China Camp: Failed & suspect QC points + +/- 2sd removed",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
## Warning: Removed 5 rows containing missing values (geom_point).

Hourly median
For the timeAverage function to work, you need a date or POSIXct column named “date”, but here it needs the time stamp, so we’ll add it back in. interval = frequency of data collection.

cc<-subset(cc, select=-c(date))
cc$date<-as.POSIXct(cc$datetime, format = c("%m/%d/%Y %H:%M"))

cc.1hr.med<- timeAverage(cc,
            avg.time="1 hour",
            data.thresh = 0,
            statistic = "median",
            interval = "15 min",
            remove.calm = FALSE,
            type = "default")
## Warning: Missing dates detected, removing 5 lines
## Warning in checkPrep(mydata, vars, type = "default", remove.calm = FALSE, :
## Detected data with Daylight Saving Time, converting to UTC/GMT
cc.1hr.med<-subset(cc.1hr.med, select=(-c(mean, stdev, hi.95, lo.95)))
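For reference, an equivalent hourly median can be sketched with plain dplyr (no openair dependency), assuming a POSIXct `datetime` column as above; note that `timeAverage()` additionally pads missing intervals and handles the DST-to-UTC conversion shown in the warning, which this sketch does not:

```r
# Hypothetical dplyr-only hourly median; trunc() floors POSIXct to the hour
cc_hourly <- cc %>%
  mutate(hour = as.POSIXct(trunc(datetime, units = "hours"))) %>%
  group_by(hour) %>%
  summarise(ph = median(pH, na.rm = TRUE))
```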

cc_ph<-cc.1hr.med %>% ggplot(aes(x=datetime, y=pH, color=pH))+
  geom_point(alpha=0.5)+ylim(6.5,9)+
  labs(title="Hourly median pH from China Camp",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")

cc_ph
## Warning: Removed 14445 rows containing missing values (geom_point).

Interesting pattern but the data looks good.

Format and save

cc.1hr.med$date<-as.Date(cc.1hr.med$datetime, format = c("%Y-%m-%d"))

names(cc.1hr.med)[names(cc.1hr.med) == "pH"] <- "ph"

write.csv(cc.1hr.med, "C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/cc_ph.csv")

EOS pier

Set up

eos<-read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/eos_bouy_r.csv", header = TRUE, sep=",", fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)

eos$date<-as.Date(eos$date, format = c("%m/%d/%Y"))
eos$datetime<-as.POSIXct(eos$datetime, format = c("%Y-%m-%dT%H:%M:%SZ"))

eos<-subset(eos, select=c(date, datetime, ph))
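Per the notes above, the EOS timestamps are in UTC, but `as.POSIXct()` without a `tz` argument parses into the local timezone. A sketch of a UTC-aware parse (assuming the same ERDDAP-style timestamp format):

```r
# Hypothetical ISO-8601 UTC timestamp parsed as UTC rather than local time
ts <- "2019-06-01T14:30:00Z"
as.POSIXct(ts, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
## [1] "2019-06-01 14:30:00 UTC"
```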

eos%>%
  ggplot(aes(x=datetime, y=ph, color=ph))+
  geom_point(alpha=0.5)+
  labs(title="pH data from EOS research pier: All data",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of CeNCOOS")

Since this data set doesn’t have a QC flag column and I don’t see any extreme values, I’ll skip that step and move on to the rolling mean. Not sure about that lower section of values during the summer of 2019.

Roll apply using custom stat function. Data collected every 6 minutes so 20 observations=2 hours. Needs a column named “date” with no time stamp in order to work.

2hr

eos<-eos%>%
  tq_mutate(
    select= ph,
    mutate_fun= rollapply,
    width= 20,
    align= "right",
    by.column= FALSE,
    FUN= custom_stat_fun_2,
    na.rm= TRUE)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique

Filter and graph

eos<-filter(eos, ph <hi.95 & ph >lo.95)


ggplot(data = eos, mapping = aes(x = datetime, y = ph, color=ph)) + geom_point()+
  labs(title="pH data from EOS research pier: +/- 2sd data removed",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of CeNCOOS")

Still not sure about those lower values in the summer of 2019.

Hourly median

eos<-subset(eos, select=-c(date))
names(eos)[names(eos) == "datetime"] <- "date"
eos$date<-as.POSIXct(eos$date, format = c("%Y-%m-%dT%H:%M:%SZ"))

eos.1hr.med<- timeAverage(eos,
            avg.time="1 hour",
            data.thresh = 0,
            statistic = "median",
            interval = "6 min",
            remove.calm = FALSE,
            type = "default")
## Warning in checkPrep(mydata, vars, type = "default", remove.calm = FALSE, :
## Detected data with Daylight Saving Time, converting to UTC/GMT
eos.1hr.med<-subset(eos.1hr.med, select=(-c(mean, stdev, hi.95, lo.95)))

eos_ph<-eos.1hr.med %>% ggplot(aes(x=date, y=ph, color=ph))+
  geom_point(alpha=0.5)+ylim(6.5,9)+
  labs(title="Hourly median pH from EOS research pier",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of CeNCOOS")
eos_ph
## Warning: Removed 588 rows containing missing values (geom_point).

Most of those lower values in the summer of 2019 were removed, but I’m not sure whether these are good data. Need to verify.

Format and save

names(eos.1hr.med)[names(eos.1hr.med) == "date"] <- "datetime"
eos.1hr.med$date<-as.Date(eos.1hr.med$datetime, format = c("%Y-%m-%d"))
write.csv(eos.1hr.med, "C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/eos_ph.csv")

Richardson Bay

Set up

rb<-read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/richardson_bay_all.csv", header = TRUE, sep=",", fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)

rb$date<-as.Date(rb$DateTimeStamp, format = c("%m/%d/%y"))
rb$datetime<-as.POSIXct(rb$DateTimeStamp, format = c("%m/%d/%y %H:%M"))

rb<-subset(rb, select=c(date, datetime, pH, F_pH, DateTimeStamp))

rb%>%
  ggplot(aes(x=datetime, y=pH, color=pH))+
  geom_point(alpha=0.5)+
  labs(title="pH data from Richardson Bay: All data",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
## Warning: Removed 1508 rows containing missing values (geom_point).

Remove flagged data that did not pass QAQC.

rb<-rb[- grep("-3", rb$F_pH),]


rb%>%
  ggplot(aes(x=datetime, y=pH, color=pH))+
  geom_point(alpha=0.5)+
  labs(title="pH data from Richardson Bay: Failed QC data removed",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
## Warning: Removed 1508 rows containing missing values (geom_point).

Similar pattern to China Camp as far as the striations go.

Remove data points flagged as “suspect”

rb<-rb[- grep("1", rb$F_pH),]

rb%>%
  ggplot(aes(x=datetime, y=pH, color=pH))+
  geom_point(alpha=0.5)+
  labs(title="pH from Richardson Bay: Failed & suspect QC data removed",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
## Warning: Removed 1508 rows containing missing values (geom_point).

Roll apply using the custom stat function. Data collected every 15 min, so a 2 hr window = 8 observations.

rb<-rb%>%
  tq_mutate(
    select= pH,
    mutate_fun= rollapply,
    width= 8,
    align= "right",
    by.column= FALSE,
    FUN= custom_stat_fun_2,
    na.rm= TRUE)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique

Filter

rb<-filter(rb, pH <hi.95 & pH >lo.95)

rb%>%
  ggplot(aes(x=datetime, y=pH, color=pH))+
  geom_point(alpha=0.5)+
  labs(title="pH data from Richardson Bay: Failed & suspect QC data + -/+2sd removed",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
## Warning: Removed 4 rows containing missing values (geom_point).

Hourly median

rb<-subset(rb, select=-c(date))
rb$date<-as.POSIXct(rb$datetime, format = c("%m/%d/%y %H:%M"))


rb.1hr.med<- timeAverage(rb,
            avg.time="1 hour",
            data.thresh = 0,
            statistic = "median",
            interval = "15 min",
            remove.calm = FALSE,
            type = "default")
## Warning: Missing dates detected, removing 4 lines
## Warning in checkPrep(mydata, vars, type = "default", remove.calm = FALSE, :
## Detected data with Daylight Saving Time, converting to UTC/GMT
rb_ph<-rb.1hr.med%>%
  ggplot(aes(x=date, y=pH, color=pH))+
  geom_point(alpha=0.5)+ylim(6.5,9)+
  labs(title="Hourly median pH data from Richardson Bay",
       subtitle="01/01/2017 - 12/31/2019",
       caption= "data courtesy of NERR")
rb_ph

Looks pretty good. Again, interesting pattern.

Format and save

rb.1hr.med<-subset(rb.1hr.med, select=(-c(mean, stdev, hi.95, lo.95)))
names(rb.1hr.med)[names(rb.1hr.med) == "pH"] <- "ph"
rb.1hr.med$date<-as.Date(rb.1hr.med$date, format = c("%Y-%m-%d"))
write.csv(rb.1hr.med, "C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/rb_ph.csv")

Fort Point

No dissolved oxygen data at this station.

Graph together

all_ph<-ggarrange(cc_ph, eos_ph, rb_ph, nrow=3, ncol=1)
## Warning: Removed 14445 rows containing missing values (geom_point).
## Warning: Removed 588 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_point).
all_ph
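Since patchwork is already loaded, the same three-panel stack can also be written with its `/` operator, which may make later tweaks (shared legends, panel tags) easier:

```r
# patchwork equivalent of the ggarrange() stack above
all_ph_pw <- cc_ph / eos_ph / rb_ph
all_ph_pw
```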