ENV222 EXERCISE
R in statistics
0 Quiz 1
1.
The distribution is shown in Figure @ref(fig:box).
boxplot(airquality$Ozone)
2.
rticles
3.
plot(1,
type = 'n',
axes = FALSE,
xlab = expression(NH[3] * " (" * mu* "mol m"^-3 * ")"),
ylab = '')
dtf <- c('1', '2', '3')
library(latex2exp)
plot(dtf, pch = 16, las = 1,
     xlab = TeX('$NH_3$ ($\\mu$mol m$^{-3}$)'),
     xaxt = "n", yaxt = "n",
     line = 0.4)
mymat <- matrix(c(1, 2, 3, 0), nrow = 2)
layout(mymat, widths = c(4, 1), heights = c(2, 1)) # Set the ratio between widths and heights
plot(iris$Sepal.Length, iris$Sepal.Width, pch=20, xlab='Sepal Length (cm)', ylab='Sepal Width (cm)', las=1)
boxplot(iris$Sepal.Length, pch=20, las=1, horizontal=T)
boxplot(iris$Sepal.Width, pch=20, las=2)
x <- 'pack my box with five dozen liquor jugs'
xsingle <- strsplit(x, split = "")[[1]]
xsingle <- xsingle[xsingle != " "]
length(xsingle)
[1] 32
mytab <- table(xsingle)
names(mytab[mytab == max(mytab)])
[1] "i" "o"
library(dplyr)
<- "pack my box with five dozen liquor jugs"
x <- gsub(" ", "", x) |>
char_freq strsplit("") |>
table() |>
data.frame() |>
arrange(desc(Freq))
head(char_freq, 5)
Var1 Freq
1 i 3
2 o 3
3 e 2
4 u 2
5 a 1
xword <- strsplit(x, " ")[[1]]
xword[which.max(nchar(xword))]
[1] "liquor"
gsub('five', '5', x)
[1] "pack my box with 5 dozen liquor jugs"
[1] "English_United States.1252"
sum(diff(mydata$date) != 1)
[1] 0
mydata$year <- as.integer(format(mydata$date, '%Y'))
mydata$month <- format(mydata$date, '%B')
mydata$weekday <- format(mydata$date, '%A')
Result:
mydata
# A tibble: 65,533 × 13
date ws wd nox no2 o3 pm10 so2 co pm25
<dttm> <dbl> <int> <int> <int> <int> <int> <dbl> <dbl> <int>
1 1998-01-01 00:00:00 0.6 280 285 39 1 29 4.72 3.37 NA
2 1998-01-01 01:00:00 2.16 230 NA NA NA 37 NA NA NA
3 1998-01-01 02:00:00 2.76 190 NA NA 3 34 6.83 9.60 NA
4 1998-01-01 03:00:00 2.16 170 493 52 3 35 7.66 10.2 NA
5 1998-01-01 04:00:00 2.4 180 468 78 2 34 8.07 8.91 NA
6 1998-01-01 05:00:00 3 190 264 42 0 16 5.50 3.05 NA
7 1998-01-01 06:00:00 3 140 171 38 0 11 4.23 2.26 NA
8 1998-01-01 07:00:00 3 170 195 51 0 12 3.88 2.00 NA
9 1998-01-01 08:00:00 3.36 170 137 42 1 12 3.35 1.46 NA
10 1998-01-01 09:00:00 3.96 170 113 39 2 12 2.92 1.20 NA
# ℹ 65,523 more rows
# ℹ 3 more variables: year <int>, month <chr>, weekday <chr>
0 Quiz 2
1.
C
2.
B
3.
Wilcoxon Rank-Sum test
4.
- H0: the medians of the PM2.5 concentration in two cities are equal.
- H1: the medians of the PM2.5 concentration in two cities are not equal.
5.
There are no tied values, because the ranks are all integers:
rank(c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69))
[1] 6 8 4 5 7 3 1 10 2 9
rank(c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53))
[1] 5 1 10 4 9 2 3 8 6 7
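A direct way to confirm the absence of ties (a quick sketch using base R) is to check for duplicates across both samples before ranking:
x_A <- c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69)
x_B <- c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53)
anyDuplicated(c(x_A, x_B)) # returns 0 when there are no tied values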
6.
Test statistic value: W = 23
dtf <- data.frame(City_A = c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69),
                  City_B = c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53))
test <- wilcox.test(dtf$City_A, dtf$City_B, conf.int = TRUE)
test$statistic
W
23
7.
Critical values (the 2.5% and 97.5% quantiles of the U distribution): 24, 76
dtf <- data.frame(conc = c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69, 14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53),
                  city = c(rep("A", 10), rep("B", 10)))
m <- sum(dtf$city == "A")
n <- sum(dtf$city == "B")
U_critical <- qwilcox(c(0.025, 0.975), m, n)
U_critical
[1] 24 76
dtf <- data.frame(City_A = c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69),
                  City_B = c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53))
m <- nrow(dtf)
n <- nrow(dtf)
U_critical <- qwilcox(c(0.025, 0.975), m, n)
U_critical
[1] 24 76
8.
U <- 23
m <- 10
n <- 10
curve(dwilcox(x, m, n), from = 0, to = 100, n = 101, ylab = 'Probability distribution density', las = 1, type = 'S', ylim = c(0, 0.031))
abline(h = 0)
segments(x0 = c(U, U_critical), y0 = 0, x1 = c(U, U_critical), y1 = dwilcox(c(U, U_critical), m, n), col = c('blue', 'red', 'red'))
legend('topright', legend = c('Distribution curve', 'Critical values', 'U score'), col = c('black', 'red', 'blue'), lty = 1, cex = 0.7)
9.
The P-value is 0.043
dtf <- data.frame(City_A = c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69),
                  City_B = c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53))
test <- wilcox.test(dtf$City_A, dtf$City_B, conf.int = TRUE)
round(test$p.value, 3)
[1] 0.043
10.
The P-value is smaller than 0.05, so we reject the null hypothesis H0.
11.
The two medians are different at alpha = 0.05: City_A and City_B have significantly different median PM2.5 concentrations.
12.
The mean and standard deviation of the approximate normal distribution are 50 and 13.22876, respectively.
U_mu <- m * n / 2
U_mu
[1] 50
U_sd <- sqrt(m * n * (m + n + 1) / 12)
U_sd
[1] 13.22876
13.
curve(dnorm(x, U_mu, U_sd), from=0, to=110)
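To see how closely the approximation tracks the exact distribution, the two curves can be overlaid (a sketch reusing U_mu, U_sd, m, and n from above):
curve(dnorm(x, U_mu, U_sd), from = 0, to = 110, col = 'blue', las = 1, ylab = 'Density')
curve(dwilcox(x, m, n), from = 0, to = 100, n = 101, type = 'S', add = TRUE) # exact Wilcoxon distribution
legend('topright', legend = c('Normal approximation', 'Exact dwilcox'), col = c('blue', 'black'), lty = 1)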
1 Rmarkdown Exercise
NA
2 Character Exercise
NA
3 Time Exercise
1. Today
- Which day of the year is it?
Sys.time()
[1] "2023-05-11 12:27:12 CST"
- Convert it as “Year, Month in English and Date, Weekday in English”, such as “2023, January 1, Sunday”.
format(Sys.Date(), format = '%Y, %B %d, %A')
[1] "2023, May 11, Thursday"
- Suppose we get to know each other at 15:00, September 13, 2022. How many days have we known each other until now? How many hours? How many seconds?
t1 <- Sys.time()
t2 <- strptime("2022-9-13 15:00:00", format = "%Y-%m-%d %H:%M:%S")
difftime(time1 = t1, time2 = t2, units = 'secs')
Time difference of 20726833 secs
difftime(time1 = t1, time2 = t2, units = 'days')
Time difference of 239.8939 days
difftime(time1 = t1, time2 = t2, units = 'hours')
Time difference of 5757.454 hours
- The anniversary is September 13, 2023. What day is it? How about in 2024? Plot a graph for September 13 of each year, with the weekday as x and the year as y.
<- as.Date( "9/13/2023", format="%m/%d/%Y" )
t3 format(t3, format="%A")
[1] "Wednesday"
<- as.Date( "9/13/2024", format="%m/%d/%Y" )
t4 format(t4, format="%A")
[1] "Friday"
<- seq(as.Date("2020-09-13"), as.Date("2040-09-13"), by="year")
dates <- factor(weekdays(dates, abbreviate=TRUE),
weekdays levels=c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
<- as.integer(format(dates, "%u"))
weekdays plot(weekdays, as.Date(dates, format="%Y" ),
xlab="Weekday", ylab="Year", main="September 13th, 2020-2040")
2. The airquality dataset
- Create a new column “Date”, showing the date in the format of “Year-month-day”, such as “2023-01-31”.
air <- read.csv("data/airquality.csv")
day <- paste("1973-", air$Month, "-", air$Day, sep = "") # the airquality measurements are from 1973
air$Date <- as.Date(day, format = "%Y-%m-%d")
- Plot a graph for each atmospheric variable (i.e. ozone in ppb, solar radiation in W m-2, wind speed in m s-1, air temperature in °C) against the date.
air1 <- na.omit(air)
subsets <- split(air1, air1$Month)
par(mfrow = c(2, 3)) # set the panel grid before drawing
plots <- lapply(subsets, function(subset) {
  plot(subset$Day, subset$Ozone, ylab = "Ozone (ppb)",
       main = paste("Ozone,", unique(subset$Month)))
})
library(ggthemes)
library(latex2exp)
dtf <- air # use the data frame that carries the new Date column
dtf5 <- dtf %>% subset(Date < "1973-06-01")
ggplot(dtf5) +
geom_point(aes(x = Date, y = Ozone))+
labs(y = "Ozone (ppb)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(dtf5) +
geom_point(aes(x = Date, y = Solar.R))+
labs(y = TeX("$\\text{Solar Radiation } (W\\cdot m^{-2})$")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(dtf5) +
geom_point(aes(x = Date, y = Wind))+
labs(y = TeX("$\\text{Wind speed }m\\cdot m^{-1}$")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(dtf5) +
geom_point(aes(x = Date, y = Temp))+
labs(y = TeX("$\\text{Temperature }\\circ C$")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
- Create a new column “Weekday”, showing the day in the week (1 as Monday and 7 as Sunday).
# install.packages("lubridate")
library(lubridate)
dtf$Weekday <- wday(as.Date(dtf$Date), week_start = 1)
- Calculate the mean values of each atmospheric variable for each weekday.
dtf1 <- na.omit(dtf) # dtf holds the Weekday column created above
tapply(as.numeric(dtf1$Ozone), dtf1$Weekday, mean)
library(tidyverse)
mean_dtf <- dtf %>%
  drop_na() %>%
  group_by(Weekday) %>%
  summarise(Ozone = mean(Ozone),
            Solar.R = mean(Solar.R),
            Wind = mean(Wind),
            Temp = mean(Temp))
- Plot a graph for each atmospheric variable with the mean values against the weekdays.
library(latex2exp)
library(patchwork)
mean_ozone <- ggplot(mean_dtf) +
  geom_line(aes(x = Weekday, y = Ozone)) +
  labs(y = "Ozone (ppb)") +
  theme_classic()
mean_solar <- ggplot(mean_dtf) +
  geom_line(aes(x = Weekday, y = Solar.R)) +
  labs(y = TeX("$\\text{Solar Radiation } (W\\cdot m^{-2})$")) +
  theme_classic()
mean_wind <- ggplot(mean_dtf) +
  geom_line(aes(x = Weekday, y = Wind)) +
  labs(y = TeX("$\\text{Wind speed } (m\\cdot s^{-1})$")) +
  theme_classic()
mean_temp <- ggplot(mean_dtf) +
  geom_line(aes(x = Weekday, y = Temp)) +
  labs(y = TeX("$\\text{Temperature } (^\\circ C)$")) +
  theme_classic()
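Since patchwork is already loaded, the four panels can be arranged into a single figure (a sketch using patchwork's composition operators):
(mean_ozone + mean_solar) / (mean_wind + mean_temp)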
3. The Smart Flux data set
The smartflux.zip data set was obtained from the flux observation at the Outdoor Research and Teaching Station (ORTS), XJTLU. It was supposed to be an automatic continuous measurement on a half-hourly basis, saved in daily files. However, there are missing data in the records for some reason. How many data files are missing? How many records are missing?
setwd("data/smartflux/") # Remeber to delete the first file
<- list.files() %>%
temp substr(start = 1, stop =10) %>%
strptime(format = "%Y-%m-%d") %>%
format("%Y-%m-%d")
<- 0
missing_files for(i in 1:(length(temp)-1)){
<- as.vector(difftime(temp[i+1], temp[i], units = "days"))
diff if(diff != 1){
<- missing_files + diff -1
missing_files
}
}print(missing_files)
temp <- list.files()
files <- lapply(temp, function(i){
  read.table(i, fill = TRUE)
})
missing_records <- 0
for (i in 1:length(files)) {
  records <- length(files[[i]][3:nrow(files[[i]]), 4])
  missing_records <- missing_records + 48 - records
}
missing_records <- missing_records + 48 * missing_files
print(missing_records)
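As a cross-check on the missing-file count (a sketch assuming one file per calendar day, named by its date as above), the expected daily sequence can be compared with the dates actually present:
present <- as.Date(substr(list.files(), 1, 10))
expected <- seq(min(present), max(present), by = "day")
length(setdiff(as.character(expected), as.character(present))) # number of missing files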
4 Plot Exercise
- The colour range from red to blue is sufficient; adding purple is unnecessary and makes the graph harder to read.
- The x and y axis labels are missing. Even though the information is simple, the basic plot elements are still needed.
- The main problem is that the y-axis label has no unit. Furthermore, notation such as p53-/- should not be written in this plain form.
- We had better not use 3D plots, especially for plots showing proportions.
- The two figures can be combined into one using different colours. Switching to a blank theme would also make the plot easier to read.
- The x and y axis labels are missing. Even though the information is simple, the basic plot elements are still needed.
library(latex2exp)
plot(iris$Sepal.Length, iris$Petal.Length, pch = 16, las = 1,
xlab = TeX('D$_p$($\\mu$m)'),
ylab = expression(frac('dN', 'dlogD'[p] * '(cm' ^-3 * ')')))
mymat <- matrix(c(1, 2, 3, 4), nrow = 2)
layout(mymat, widths = c(1, 2), heights = c(2, 1)) # widths/heights must match the 2 columns and 2 rows
plot(co2)
plot(co2)
plot(co2)
plot(co2)
- There are some errors related to the figure margins that prevent me from generating the PDF book.
See Figure @ref(fig:figure1).
5 R Tidyverse
stocks <- data.frame(year = c(2015, 2015, 2016, 2016),
                     half = c(1, 2, 1, 2),
                     return = c(1.88, 0.59, 0.92, 0.17))
stocks_wide <- stocks %>%
  pivot_wider(names_from = half, values_from = return)
stocks_wide
# A tibble: 2 × 3
year `1` `2`
<dbl> <dbl> <dbl>
1 2015 1.88 0.59
2 2016 0.92 0.17
# The code fails because the column names 1999 and 2000 are non-syntactic, so they must be wrapped in backticks to be referenced correctly.
table4a %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
# A tibble: 6 × 3
country year cases
<chr> <chr> <dbl>
1 Afghanistan 1999 745
2 Afghanistan 2000 2666
3 Brazil 1999 37737
4 Brazil 2000 80488
5 China 1999 212258
6 China 2000 213766
# Pivoting this table wider fails because the name/names combinations are duplicated
# ("Phillip Woods" has two "age" rows), so the data are not tidy; a possible fix is sketched after the data.
people <- tribble(
  ~name, ~names, ~values,
  "Phillip Woods", "age", 45,
  "Phillip Woods", "height", 186,
  "Phillip Woods", "age", 50,
  "Jessica Cordero", "age", 37,
  "Jessica Cordero", "height", 156
)
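One possible fix (a sketch) is to make each name/names pair unique before widening, for example by numbering repeated observations:
people %>%
  group_by(name, names) %>%
  mutate(obs = row_number()) %>% # disambiguates the two "age" rows for Phillip Woods
  ungroup() %>%
  pivot_wider(names_from = names, values_from = values)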
preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes", NA, 10,
  "no", 20, 12
)
preg1 <- pivot_longer(preg, c("male", "female"), names_to = "Gender", values_to = "Number")
preg1
# A tibble: 4 × 3
pregnant Gender Number
<chr> <chr> <dbl>
1 yes male NA
2 yes female 10
3 no male 20
4 no female 12
5.
(1) To determine which plane (tailnum) has the worst on-time record, we can calculate the average departure delay for each plane and then sort the results in descending order; the plane with the highest average delay comes first.
# install.packages("nycflights13")
library(dplyr)
library(nycflights13)
flights %>%
  group_by(tailnum) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  head(1)
# A tibble: 1 × 2
tailnum avg_dep_delay
<chr> <dbl>
1 N844MH 297
(2) We should depart at 5 am, which has the smallest average departure delay of 0.688 min.
flights %>%
  group_by(hour) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(avg_dep_delay) %>%
  head(2)
# A tibble: 2 × 2
hour avg_dep_delay
<dbl> <dbl>
1 5 0.688
2 6 1.64
(3) To compute the total minutes of delay for each destination, we can group the flights by destination and sum the arrival and departure delays together for each group.
flights %>%
  group_by(dest) %>%
  summarise(total_delay = sum(arr_delay + dep_delay, na.rm = TRUE))
# A tibble: 105 × 2
dest total_delay
<chr> <dbl>
1 ABQ 4603
2 ACK 2983
3 ALB 15819
4 ANC 83
5 ATL 399462
6 AUS 45898
7 AVL 4216
8 BDL 10205
9 BGR 9732
10 BHM 12345
# ℹ 95 more rows
flights %>%
  group_by(dest) %>%
  mutate(total_delay = arr_delay + dep_delay) %>%
  mutate(prop_delay = total_delay / sum(total_delay, na.rm = TRUE))
# A tibble: 336,776 × 21
# Groups: dest [105]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 336,766 more rows
# ℹ 13 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, total_delay <dbl>,
# prop_delay <dbl>
(4)
(5) For each plane in each month, we can group the flights data by tailnum and month, then count the number of flights.
flights %>%
  group_by(tailnum, month) %>%
  summarise(flights = n())
# A tibble: 37,988 × 3
# Groups: tailnum [4,044]
tailnum month flights
<chr> <int> <int>
1 D942DN 2 1
2 D942DN 3 2
3 D942DN 7 1
4 N0EGMQ 1 41
5 N0EGMQ 2 28
6 N0EGMQ 3 30
7 N0EGMQ 4 29
8 N0EGMQ 5 13
9 N0EGMQ 6 32
10 N0EGMQ 7 11
# ℹ 37,978 more rows
6 Post-hoc tests
- The three species have significantly different mean sepal lengths at the 0.05 alpha level.
library(agricolae)
iris.aov <- aov(Sepal.Length ~ Species, data = iris)
summary(iris.aov)
Df Sum Sq Mean Sq F value Pr(>F)
Species 2 63.21 31.606 119.3 <2e-16 ***
Residuals 147 38.96 0.265
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
LSD.test(iris.aov, "Species", console = TRUE)
Study: iris.aov ~ "Species"
LSD t Test for Sepal.Length
Mean Square Error: 0.2650082
Species, means and individual ( 95 %) CI
Sepal.Length std r LCL UCL Min Max
setosa 5.006 0.3524897 50 4.862126 5.149874 4.3 5.8
versicolor 5.936 0.5161711 50 5.792126 6.079874 4.9 7.0
virginica 6.588 0.6358796 50 6.444126 6.731874 4.9 7.9
Alpha: 0.05 ; DF Error: 147
Critical Value of t: 1.976233
least Significant Difference: 0.2034688
Treatments with the same letter are not significantly different.
Sepal.Length groups
virginica 6.588 a
versicolor 5.936 b
setosa 5.006 c
pairwise.t.test(iris$Sepal.Length, iris$Species, p.adjust.method = "bonferroni")
Pairwise comparisons using t tests with pooled SD
data: iris$Sepal.Length and iris$Species
setosa versicolor
versicolor 2.6e-15 -
virginica < 2e-16 8.3e-09
P value adjustment method: bonferroni
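Another common post-hoc option (a sketch using base R) is Tukey's HSD on the same ANOVA fit, which adjusts for multiple comparisons by construction:
TukeyHSD(iris.aov)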
- The haematocrit measurements differ for at least two water sources at the 0.01 significance level; specifically, Dushu and Control differ significantly.
<- data.frame("HTC"= c(38, 40, 32, 36, 40, 40, 38, 40, 38, 40, 36, 40, 40, 35, 45,
inflen 56, 60, 50, 50, 50, 35, 40, 40, 55, 35,
40, 42, 38, 46, 36),
"Source" = c(rep("Control",15),
rep("Dushu",10),
rep("Jinji",5)))
<- aov(HTC~Source, data = inflen)
wg_aov1 summary(wg_aov1)
Df Sum Sq Mean Sq F value Pr(>F)
Source 2 450.5 225.23 6.669 0.00443 **
Residuals 27 911.8 33.77
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(LSD.test(wg_aov1, "Source", alpha = 0.01,console = T))
Study: wg_aov1 ~ "Source"
LSD t Test for HTC
Mean Square Error: 33.7716
Source, means and individual ( 99 %) CI
HTC std r LCL UCL Min Max
Control 38.53333 2.996824 15 34.37598 42.69069 32 45
Dushu 47.10000 8.987028 10 42.00830 52.19170 35 60
Jinji 40.40000 3.847077 5 33.19925 47.60075 36 46
Alpha: 0.01 ; DF Error: 27
Critical Value of t: 2.770683
Groups according to probability of means differences and alpha level( 0.01 )
Treatments with the same letter are not significantly different.
HTC groups
Dushu 47.10000 a
Jinji 40.40000 ab
Control 38.53333 b
Length Class Mode
statistics 4 data.frame list
parameters 5 data.frame list
means 10 data.frame list
comparison 0 -none- NULL
groups 2 data.frame list
inflen_pt <- pairwise.t.test(inflen$HTC, inflen$Source, pool.sd = FALSE, var.equal = TRUE, p.adj = "bonferroni")
inflen_pt
Pairwise comparisons using t tests with non-pooled SD
data: inflen$HTC and inflen$Source
Control Dushu
Dushu 0.0066 -
Jinji 0.8227 0.4191
P value adjustment method: bonferroni
inflen_pt$p.value < 0.01
Control Dushu
Dushu TRUE NA
Jinji FALSE FALSE
7 Non-parametric hypothesis tests
Wilcoxon Rank-Sum test: Used to compare whether the medians of two independent groups are equal. It is suitable for situations where the sample does not follow a normal distribution or where the variances are unequal.
Wilcoxon Signed-Rank test: Used to compare whether the medians of two related groups are equal. It is suitable for situations where the sample does not follow a normal distribution or where the variances are unequal.
Kruskal-Wallis test: Used to compare whether the medians of three or more independent groups are equal. It is suitable for situations where the sample does not follow a normal distribution or where the variances are unequal.
Friedman test: Used to compare whether the medians of three or more related groups are equal. It is suitable for situations where the sample does not follow a normal distribution or where the variances are unequal.
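As a quick reference, the corresponding R calls look like this (a sketch on built-in or toy data, not part of the exercises):
wilcox.test(Ozone ~ Month, data = airquality, subset = Month %in% c(5, 8)) # rank-sum: two independent groups
wilcox.test(sleep$extra[sleep$group == 1],
            sleep$extra[sleep$group == 2], paired = TRUE) # signed-rank: two related groups
kruskal.test(Sepal.Length ~ Species, data = iris) # three or more independent groups
friedman.test(matrix(c(1, 2, 3, 2, 3, 1, 3, 1, 2), nrow = 3, byrow = TRUE)) # toy blocks: rows are blocks, columns are groups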
- Trichloroethylene (TCE) is reported by the EPA to be the most prevalent solvent contaminating groundwater at superfund sites in the United States. Exposure can result in effects to the immune and reproductive systems, liver, kidneys, central nervous system, and may affect fetal development during pregnancy. Long term exposures to TCE can increase the risk of kidney cancer. There is also evidence that TCE exposure can increase the risk for non-Hodgkin’s lymphoma and liver cancer.
The “tce” dataset shows the TCE concentrations (mg/L) at 10 randomly selected groundwater monitoring wells before remediation and the same wells after remediation. Download the tce.csv file. Is the median TCE concentration different after the remediation from before? Carry out a nonparametric test.
dtf <- read.csv('data/tce.csv')
wilcox.test(dtf$TCE.mg.per.L[dtf$Period == 'Before'],
            dtf$TCE.mg.per.L[dtf$Period == 'After'],
            paired = TRUE, conf.int = TRUE,
            correct = FALSE)
Wilcoxon signed rank exact test
data: dtf$TCE.mg.per.L[dtf$Period == "Before"] and dtf$TCE.mg.per.L[dtf$Period == "After"]
V = 55, p-value = 0.001953
alternative hypothesis: true location shift is not equal to 0
95 percent confidence interval:
6.8650 28.5715
sample estimates:
(pseudo)median
18.78
- The iris dataset. Are the median sepal lengths equal among the three species? Carry out a nonparametric test.
kruskal.test(Sepal.Length ~ Species, data = iris)
Kruskal-Wallis rank sum test
data: Sepal.Length by Species
Kruskal-Wallis chi-squared = 96.937, df = 2, p-value < 2.2e-16
boxplot(Sepal.Length ~ Species,
data = iris,
horizontal = TRUE,
notch = TRUE, las = 1)
8 Multiple Linear Regression
- The airquality dataset. Build a multiple linear regression model for predicting the ozone concentration (the dependent variable) with the solar radiation, wind speed, temperature, and month (the independent variables).
# Step 1: Visualization
dtf <- airquality[, -6]
dtf <- na.omit(dtf)
dtf_cor <- cor(dtf)
GGally::ggpairs(dtf)
pairs(dtf)
corrplot::corrplot(dtf_cor,
                   order = 'AOE',
                   type = "upper",
                   method = "number")
corrplot::corrplot(dtf_cor,
                   order = "AOE",
                   type = "lower",
                   method = "ellipse",
                   diag = FALSE,
                   tl.pos = "l",
                   cl.pos = "n",
                   add = TRUE)
# Step 2: Maximal model
ozone_concentration_lm <- lm(Ozone ~ ., data = dtf)
summary(ozone_concentration_lm)
Call:
lm(formula = Ozone ~ ., data = dtf)
Residuals:
Min 1Q Median 3Q Max
-35.870 -13.968 -2.671 9.553 97.918
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -58.05384 22.97114 -2.527 0.0130 *
Solar.R 0.04960 0.02346 2.114 0.0368 *
Wind -3.31651 0.64579 -5.136 1.29e-06 ***
Temp 1.87087 0.27363 6.837 5.34e-10 ***
Month -2.99163 1.51592 -1.973 0.0510 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 20.9 on 106 degrees of freedom
Multiple R-squared: 0.6199, Adjusted R-squared: 0.6055
F-statistic: 43.21 on 4 and 106 DF, p-value: < 2.2e-16
confint(ozone_concentration_lm)
2.5 % 97.5 %
(Intercept) -1.035964e+02 -12.51131722
Solar.R 3.088529e-03 0.09610512
Wind -4.596849e+00 -2.03616955
Temp 1.328372e+00 2.41337586
Month -5.997092e+00 0.01383629
anova(ozone_concentration_lm)
Analysis of Variance Table
Response: Ozone
Df Sum Sq Mean Sq F value Pr(>F)
Solar.R 1 14780 14780 33.8357 6.427e-08 ***
Wind 1 39969 39969 91.5037 5.327e-16 ***
Temp 1 19050 19050 43.6117 1.639e-09 ***
Month 1 1701 1701 3.8946 0.05104 .
Residuals 106 46302 437
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 3: Check the Multicollinearity
faraway::vif(dtf[, -1])
Solar.R Wind Temp Month
1.151409 1.329309 1.712455 1.256371
# Step 4: simplify the model using Backward selection
get_p <- function(model){
  smry2 <- data.frame(summary(model)$coefficients)
  smry2[order(-smry2$Pr...t..), ]
}
get_p(ozone_concentration_lm)
Estimate Std..Error t.value Pr...t..
Month -2.99162786 1.51592267 -1.973470 5.104492e-02
Solar.R 0.04959683 0.02345827 2.114258 3.683836e-02
(Intercept) -58.05383883 22.97114118 -2.527251 1.297198e-02
Wind -3.31650940 0.64578918 -5.135591 1.286145e-06
Temp 1.87087379 0.27363201 6.837189 5.337230e-10
lm2 <- update(ozone_concentration_lm, . ~ . - Month)
get_p(lm2)
Estimate Std..Error t.value Pr...t..
Solar.R 0.05982059 0.02318647 2.579979 1.123664e-02
(Intercept) -64.34207893 23.05472435 -2.790841 6.226638e-03
Wind -3.33359131 0.65440710 -5.094063 1.515934e-06
Temp 1.65209291 0.25352979 6.516366 2.423506e-09
lm3 <- step(ozone_concentration_lm, direction = "both")
Start: AIC=679.71
Ozone ~ Solar.R + Wind + Temp + Month
Df Sum of Sq RSS AIC
<none> 46302 679.71
- Month 1 1701.2 48003 681.71
- Solar.R 1 1952.6 48254 682.29
- Wind 1 11520.5 57822 702.37
- Temp 1 20419.5 66721 718.26
# Step 5: report the results
summary(lm2)
Call:
lm(formula = Ozone ~ Solar.R + Wind + Temp, data = dtf)
Residuals:
Min 1Q Median 3Q Max
-40.485 -14.219 -3.551 10.097 95.619
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -64.34208 23.05472 -2.791 0.00623 **
Solar.R 0.05982 0.02319 2.580 0.01124 *
Wind -3.33359 0.65441 -5.094 1.52e-06 ***
Temp 1.65209 0.25353 6.516 2.42e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 21.18 on 107 degrees of freedom
Multiple R-squared: 0.6059, Adjusted R-squared: 0.5948
F-statistic: 54.83 on 3 and 107 DF, p-value: < 2.2e-16
stargazer::stargazer(lm2, type = 'text')
===============================================
Dependent variable:
---------------------------
Ozone
-----------------------------------------------
Solar.R 0.060**
(0.023)
Wind -3.334***
(0.654)
Temp 1.652***
(0.254)
Constant -64.342***
(23.055)
-----------------------------------------------
Observations 111
R2 0.606
Adjusted R2 0.595
Residual Std. Error 21.181 (df = 107)
F Statistic 54.834*** (df = 3; 107)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
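Before reporting, it is also worth checking the residual diagnostics of the chosen model (a sketch using base R's plot method for lm objects):
par(mfrow = c(2, 2))
plot(lm2) # residuals vs fitted, Q-Q, scale-location, residuals vs leverage
par(mfrow = c(1, 1))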
- A scientist studies the social factors that influence heart disease. She surveys 500 towns and gathers data on the percentage of people in each town who smoke, the percentage who bike to work, and the percentage who have heart disease. The data set is available as heart.data.csv. Perform a multiple linear regression analysis.
<- read.csv("data/heart.data.csv")
dtf <- dtf[, -1]
dtf <- na.omit(dtf)
dtf <- cor(dtf)
dtf_cor ::ggpairs(dtf) GGally
corrplot::corrplot(dtf_cor,
                   order = 'AOE',
                   type = "upper",
                   method = "number")
corrplot::corrplot(dtf_cor,
                   order = "AOE",
                   type = "lower",
                   method = "ellipse",
                   diag = FALSE,
                   tl.pos = "l",
                   cl.pos = "n",
                   add = TRUE)
heart_disease_lm <- lm(heart.disease ~ ., data = dtf)
summary(heart_disease_lm)
Call:
lm(formula = heart.disease ~ ., data = dtf)
Residuals:
Min 1Q Median 3Q Max
-2.1789 -0.4463 0.0362 0.4422 1.9331
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.984658 0.080137 186.99 <2e-16 ***
biking -0.200133 0.001366 -146.53 <2e-16 ***
smoking 0.178334 0.003539 50.39 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.654 on 495 degrees of freedom
Multiple R-squared: 0.9796, Adjusted R-squared: 0.9795
F-statistic: 1.19e+04 on 2 and 495 DF, p-value: < 2.2e-16
confint(heart_disease_lm)
2.5 % 97.5 %
(Intercept) 14.8272075 15.1421084
biking -0.2028166 -0.1974495
smoking 0.1713800 0.1852878
anova(heart_disease_lm)
Analysis of Variance Table
Response: heart.disease
Df Sum Sq Mean Sq F value Pr(>F)
biking 1 9090.6 9090.6 21251.7 < 2.2e-16 ***
smoking 1 1086.0 1086.0 2538.8 < 2.2e-16 ***
Residuals 495 211.7 0.4
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
faraway::vif(dtf[, 1:2])
biking smoking
1.000229 1.000229
get_p <- function(model){
  smry2 <- data.frame(summary(model)$coefficients)
  smry2[order(-smry2$Pr...t..), ]
}
get_p(heart_disease_lm)
Estimate Std..Error t.value Pr...t..
smoking 0.1783339 0.003539307 50.38667 5.192235e-197
(Intercept) 14.9846580 0.080136914 186.98821 0.000000e+00
biking -0.2001331 0.001365858 -146.52547 0.000000e+00
stargazer::stargazer(heart_disease_lm, type = 'text')
===============================================
Dependent variable:
---------------------------
heart.disease
-----------------------------------------------
biking -0.200***
(0.001)
smoking 0.178***
(0.004)
Constant 14.985***
(0.080)
-----------------------------------------------
Observations 498
R2 0.980
Adjusted R2 0.980
Residual Std. Error 0.654 (df = 495)
F Statistic 11,895.240*** (df = 2; 495)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
9 Logistic Regression
To be continued…
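In the meantime, here is a minimal sketch of the kind of model this section will cover, using the built-in mtcars data as a stand-in (the response am is transmission type, coded 0/1):
fit.logit <- glm(am ~ wt + hp, data = mtcars, family = binomial()) # logistic regression via glm
summary(fit.logit)
exp(coef(fit.logit)) # coefficients as odds ratios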
10 Poisson Regression
A scientist studies the effect of seizure drugs on the number of seizures during the first eight weeks of treatment. The dataset is available in the robust package:
Carry out a Poisson regression of the number of seizures after treatment (sumY) as the dependent variable, and the treatment conditions (Trt), Age, and the number of seizures before treatment (Base) as the independent variables.
data("breslow.dat", package = 'robust')
View(breslow.dat)
fit.breslow <- glm(sumY ~ Trt + Age + Base, data = breslow.dat, family = poisson())
summary(fit.breslow)
Call:
glm(formula = sumY ~ Trt + Age + Base, family = poisson(), data = breslow.dat)
Deviance Residuals:
Min 1Q Median 3Q Max
-6.0569 -2.0433 -0.9397 0.7929 11.0061
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.9488259 0.1356191 14.370 < 2e-16 ***
Trtprogabide -0.1527009 0.0478051 -3.194 0.0014 **
Age 0.0227401 0.0040240 5.651 1.59e-08 ***
Base 0.0226517 0.0005093 44.476 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 2122.73 on 58 degrees of freedom
Residual deviance: 559.44 on 55 degrees of freedom
AIC: 850.71
Number of Fisher Scoring iterations: 5
exp(coef(fit.breslow))
(Intercept) Trtprogabide Age Base
7.0204403 0.8583864 1.0230007 1.0229102
stargazer::stargazer(fit.breslow, type = 'text')
=============================================
Dependent variable:
---------------------------
sumY
---------------------------------------------
Trtprogabide -0.153***
(0.048)
Age 0.023***
(0.004)
Base 0.023***
(0.001)
Constant 1.949***
(0.136)
---------------------------------------------
Observations 59
Log Likelihood -421.354
Akaike Inf. Crit. 850.707
=============================================
Note: *p<0.1; **p<0.05; ***p<0.01
equatiomatic::extract_eq(fit.breslow, use_coefs = TRUE)
\[ \log ({ \widehat{E( \operatorname{sumY} )} }) = 1.95 - 0.15(\operatorname{Trt}_{\operatorname{progabide}}) + 0.02(\operatorname{Age}) + 0.02(\operatorname{Base}) \]
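Before refitting, the degree of overdispersion can be quantified (a sketch using base R accessors); the ratio of residual deviance to residual degrees of freedom should be close to 1 for a well-specified Poisson model:
deviance(fit.breslow) / df.residual(fit.breslow) # 559.44 / 55, about 10: clearly overdispersed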
fit.breslow.od <- glm(sumY ~ Trt + Age + Base, data = breslow.dat, family = quasipoisson())
summary(fit.breslow.od)
Call:
glm(formula = sumY ~ Trt + Age + Base, family = quasipoisson(),
data = breslow.dat)
Deviance Residuals:
Min 1Q Median 3Q Max
-6.0569 -2.0433 -0.9397 0.7929 11.0061
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.948826 0.465091 4.190 0.000102 ***
Trtprogabide -0.152701 0.163943 -0.931 0.355702
Age 0.022740 0.013800 1.648 0.105085
Base 0.022652 0.001747 12.969 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for quasipoisson family taken to be 11.76075)
Null deviance: 2122.73 on 58 degrees of freedom
Residual deviance: 559.44 on 55 degrees of freedom
AIC: NA
Number of Fisher Scoring iterations: 5
exp(coef(fit.breslow.od))
(Intercept) Trtprogabide Age Base
7.0204403 0.8583864 1.0230007 1.0229102
stargazer::stargazer(fit.breslow.od, type = 'text')
========================================
Dependent variable:
---------------------------
sumY
----------------------------------------
Trtprogabide -0.153
(0.164)
Age 0.023
(0.014)
Base 0.023***
(0.002)
Constant 1.949***
(0.465)
----------------------------------------
Observations 59
========================================
Note: *p<0.1; **p<0.05; ***p<0.01
equatiomatic::extract_eq(fit.breslow.od, use_coefs = TRUE)
\[ \log ({ \widehat{E( \operatorname{sumY} )} }) = 1.95 - 0.15(\operatorname{Trt}_{\operatorname{progabide}}) + 0.02(\operatorname{Age}) + 0.02(\operatorname{Base}) \]
SessionInfo:
R version 4.2.3 (2023-03-15 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19044)
Matrix products: default
locale:
[1] LC_COLLATE=Chinese (Simplified)_China.utf8
[2] LC_CTYPE=Chinese (Simplified)_China.utf8
[3] LC_MONETARY=Chinese (Simplified)_China.utf8
[4] LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] agricolae_1.3-5 nycflights13_1.0.2 lubridate_1.9.2 forcats_1.0.0
[5] stringr_1.5.0 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0
[9] tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0 openair_2.16-0
[13] dplyr_1.1.0 latex2exp_0.9.6
loaded via a namespace (and not attached):
[1] nlme_3.1-162 RColorBrewer_1.1-3 backports_1.4.1
[4] tools_4.2.3 utf8_1.2.3 R6_2.5.1
[7] AlgDesign_1.2.1 mgcv_1.8-42 questionr_0.7.8
[10] colorspace_2.1-0 withr_2.5.0 tidyselect_1.2.0
[13] GGally_2.1.2 klaR_1.7-2 compiler_4.2.3
[16] cli_3.6.1 labeling_0.4.2 equatiomatic_0.3.1
[19] scales_1.2.1 hexbin_1.28.3 digest_0.6.31
[22] minqa_1.2.5 rmarkdown_2.21 jpeg_0.1-10
[25] pkgconfig_2.0.3 htmltools_0.5.4 lme4_1.1-31
[28] labelled_2.11.0 fastmap_1.1.0 highr_0.10
[31] maps_3.4.1 htmlwidgets_1.6.2 rlang_1.1.0
[34] rstudioapi_0.14 shiny_1.7.4 generics_0.1.3
[37] farver_2.1.1 combinat_0.0-8 jsonlite_1.8.4
[40] magrittr_2.0.3 interp_1.1-4 Matrix_1.5-3
[43] Rcpp_1.0.10 munsell_0.5.0 fansi_1.0.3
[46] lifecycle_1.0.3 stringi_1.7.12 yaml_2.3.7
[49] MASS_7.3-58.2 plyr_1.8.8 grid_4.2.3
[52] promises_1.2.0.1 miniUI_0.1.1.1 deldir_1.0-6
[55] lattice_0.20-45 stargazer_5.2.3 haven_2.5.2
[58] splines_4.2.3 mapproj_1.2.11 hms_1.1.3
[61] knitr_1.42 pillar_1.9.0 boot_1.3-28.1
[64] glue_1.6.2 evaluate_0.20 latticeExtra_0.6-30
[67] nloptr_2.0.3 png_0.1-8 vctrs_0.6.2
[70] tzdb_0.3.0 httpuv_1.6.9 gtable_0.3.3
[73] reshape_0.8.9 xfun_0.37 mime_0.12
[76] broom_1.0.4 xtable_1.8-4 later_1.3.0
[79] faraway_1.0.8 cluster_2.1.4 corrplot_0.92
[82] timechange_0.2.0 ellipsis_0.3.2