# 0. 查阅文档

# Open the help page for mean(); `?topic` is the shortcut for help("topic").
?mean

# 1. 描述性统计

# usefulR 是我制作的、方便初学者使用的 R 包，其中附带了一些示例数据。

## Check whether the required packages are installed
# Report whether a package is installed.
#
# Args:
#   pkg: package name as a character string.
#
# Returns TRUE when system.file() resolves a directory for the package and
# FALSE when it returns the empty string (package not found).
is_inst <- function(pkg) {
  pkg_dir <- system.file(package = pkg)
  nzchar(pkg_dir)
}
# Install any dependency that is not available yet.
# usefulR is installed from GitHub through devtools, so devtools itself is
# installed first when absent; otherwise devtools::install_github() would
# fail with "there is no package called 'devtools'".
if (!is_inst("usefulR")) {
  if (!is_inst("devtools")) {
    install.packages("devtools")
  }
  devtools::install_github("jihongz/usefulR")
}
if (!is_inst("dplyr")) {
  install.packages("dplyr")
}

library(usefulR)
library(dplyr)

## 1.2 检查数据

### 1.2.1 行数据：部分查看

# Peek at the first rows, the last rows, and a random handful of rows.
# Recorded console output is kept as `#>` comments so the script still parses
# when it is sourced.
head(SimpleEduData)
#>   item1 item2 item3
#> 1     3     1     4
#> 2     5     4     1
#> 3     2     5     1
#> 4     3     3     2
#> 5     1     1     3
#> 6     2     4     1
tail(SimpleEduData)
#>     item1 item2 item3
#> 95      3     1     2
#> 96      3     1     2
#> 97      5     2     1
#> 98      2     1     3
#> 99      3     6     1
#> 100     6     1     1
# sample_n() draws rows at random, so the rows below are just one example draw.
sample_n(SimpleEduData, 6)
#>    item1 item2 item3
#> 73     1     5     1
#> 76     3     2     6
#> 14     3     1     4
#> 34     6     2     6
#> 8      6     5     5
#> 1      3     1     4

### 1.2.2 列数据:图表查看

# Split the plotting device into a 2 x 2 grid so several plots share a page.
par(mfrow = c(2, 2))

# Draw a histogram of one column of responses.
#
# Args:
#   onecolumn: numeric vector of responses.
#   breaks: bin edges handed to hist(). The default, 0.5, 1.5, ..., 6.5,
#     centres one bin on each integer from 1 to 6; pass other edges for data
#     outside that range (hist() errors when breaks do not span the data).
#   col, border: fill and outline colours of the bars.
#   ...: further arguments forwarded to hist(), e.g. main or xlab.
#
# Returns the "histogram" object produced by hist(), invisibly.
plot_histgram <- function(onecolumn,
                          breaks = (1:7) - 0.5,
                          col = "lightblue",
                          border = "darkblue",
                          ...) {
  hist(onecolumn, breaks = breaks, col = col, border = border, ...)
}
# Histograms of the three items (the 2 x 2 grid is set up beforehand).
plot_histgram(SimpleEduData$item1)
hist(SimpleEduData$item2, breaks = (c(1:7) - 0.5))
hist(SimpleEduData$item3, breaks = (c(1:7) - 0.5))

# Alternatively, the ggplot2 package can be used to build richer histograms.

## Add a density plot
library(ggplot2)
# NOTE(review): ggplot2 >= 3.4 prefers after_stat(density) over ..density..;
# the dot-dot form is kept here for compatibility with older installations.
ggplot(data = SimpleEduData) +
  geom_histogram(
    aes(item2, y = ..density..),
    breaks = (c(1:7) - 0.5), col = "skyblue", fill = "red3"
  ) +
  geom_density(aes(item2), position = "stack")

# Reshape to long format: one row per (item, value) pair.
# gather() and ddply() are called through their namespaces because neither
# tidyr nor plyr is attached in the visible part of this script, and attaching
# plyr after dplyr would mask dplyr::summarise() and dplyr::mutate().
plotdata <- SimpleEduData %>%
  tidyr::gather(item, value)

## Prepare for Normal Distribution Density
grid <- with(plotdata, seq(min(value), max(value), length.out = 100))
# For each item, evaluate a normal density with that item's mean and sd
# on the common grid.
normaldens <- plyr::ddply(plotdata, "item", function(df) {
  data.frame(
    predicted = grid,
    density = dnorm(grid, mean(df$value), sd(df$value))
  )
})

plotdata %>%
  ggplot(aes(x = value)) +
  geom_histogram(
    aes(y = ..density.., fill = item),
    col = "white", binwidth = 1
  ) +
  ## Add some labels for each bin
  stat_bin(
    binwidth = 1, geom = "text", colour = "white", size = 3,
    aes(label = ..count.., y = ..density..),
    position = position_stack(vjust = 0.5)
  ) +
  ## Add density line
  stat_density(
    geom = "line", position = "stack",
    alpha = 0.5, adjust = 1, linetype = 3
  ) +
  scale_x_continuous("Responses", limits = c(0, 7), labels = 1:6, breaks = 1:6) +
  scale_y_continuous(
    "Frequency",
    limits = c(0, 0.3), labels = 0:6 / 20, breaks = 0:6 / 20
  ) +
  ## Add normal density line
  geom_line(
    aes(y = density, x = predicted),
    data = normaldens, colour = "#082F45"
  ) +
  facet_wrap(~item, nrow = 2) +
  scale_fill_brewer(palette = "Set2") +
  theme_light()

# Box plots of the three items on a fresh 2 x 2 grid.
par(mfrow = c(2, 2))
boxplot(SimpleEduData$item1)
title("Item1")
boxplot(SimpleEduData$item2)
title("Item2")
boxplot(SimpleEduData$item3)
title("Item3")

### 1.2.3 检查缺失值

# summary() reports per-column quartiles and means; an "NA's" row is printed
# only for columns that contain missing values (compare the two outputs).
# Recorded console output is kept as `#>` comments so the script still parses.
summary(SimpleEduData)
#>      item1          item2          item3
#> Min.   :1.00   Min.   :1.00   Min.   :1.00
#> 1st Qu.:2.00   1st Qu.:2.00   1st Qu.:2.00
#> Median :3.00   Median :3.00   Median :3.00
#> Mean   :3.29   Mean   :3.29   Mean   :3.34
#> 3rd Qu.:5.00   3rd Qu.:5.00   3rd Qu.:5.00
#> Max.   :6.00   Max.   :6.00   Max.   :6.00
# Add some missing values: append one all-NA row.
SimpleEduData_missing <- data.frame(rbind(SimpleEduData, c(NA, NA, NA)))
tail(SimpleEduData_missing)
#>     item1 item2 item3
#> 96      3     1     2
#> 97      5     2     1
#> 98      2     1     3
#> 99      3     6     1
#> 100     6     1     1
#> 101    NA    NA    NA
summary(SimpleEduData_missing)
#>      item1          item2          item3
#> Min.   :1.00   Min.   :1.00   Min.   :1.00
#> 1st Qu.:2.00   1st Qu.:2.00   1st Qu.:2.00
#> Median :3.00   Median :3.00   Median :3.00
#> Mean   :3.29   Mean   :3.29   Mean   :3.34
#> 3rd Qu.:5.00   3rd Qu.:5.00   3rd Qu.:5.00
#> Max.   :6.00   Max.   :6.00   Max.   :6.00
#> NA's   :1      NA's   :1      NA's   :1

## 1.3 数据运算

### 1.3.1 求和

# Row totals across all columns. With na.rm = TRUE, rowSums() skips NA cells
# instead of returning NA for the whole row.
rowSums(SimpleEduData)
rowSums(SimpleEduData, na.rm = TRUE)

# Append the row total as a new column and show the first 10 rows.
# Recorded console output is kept as `#>` comments so the script still parses.
SimpleEduData %>%
  mutate(total = rowSums(.)) %>%
  head(10)
#>    item1 item2 item3 total
#> 1      3     1     4     8
#> 2      5     4     1    10
#> 3      2     5     1     8
#> 4      3     3     2     8
#> 5      1     1     3     5
#> 6      2     4     1     7
#> 7      3     3     1     7
#> 8      6     5     5    16
#> 9      5     2     3    10
#> 10     5     5     3    13
# Select the first eight rows with base indexing inside the pipe.
# Recorded console output is kept as `#>` comments so the script still parses.
SimpleEduData %>%
  .[1:8, ]
#>   item1 item2 item3
#> 1     3     1     4
#> 2     5     4     1
#> 3     2     5     1
#> 4     3     3     2
#> 5     1     1     3
#> 6     2     4     1
#> 7     3     3     1
#> 8     6     5     5
# Find the rows with one specific response pattern. The original row names are
# stored in `id` first, because the filtered result is renumbered from 1.
SimpleEduData %>%
  mutate(id = rownames(.)) %>%
  filter(item1 == 6, item2 == 1, item3 == 1)
#>   item1 item2 item3  id
#> 1     6     1     1  85
#> 2     6     1     1 100
# Mean total score and mean item scores by sex (first 50 rows coded 0, last
# 50 rows coded 1).
#
# Two corrections relative to the previously recorded run:
#   * `total` and `sex` are created in one mutate(), where `.` is still the
#     item-only data, so the 0/1 sex code is no longer added into `total`.
#   * summarise() is called as dplyr::summarise() so the grouping is honoured
#     even when another attached package (e.g. plyr) masks summarise().
# The earlier output was a single ungrouped row with Meantotalscore = 10.42,
# i.e. 3.29 + 3.29 + 3.34 + 0.5 (the mean of `sex`), which shows both problems.
# The corrected result has one row per sex with the columns
# sex, Meantotalscore, meanitem1, meanitem2.
SimpleEduData %>%
  mutate(
    total = rowSums(.),
    sex = rep(c(0, 1), each = 50)
  ) %>%
  group_by(sex) %>%
  dplyr::summarise(
    Meantotalscore = mean(total),
    meanitem1 = mean(item1),
    meanitem2 = mean(item2)
  )
library(dplyr)
library(usefulR)
# Compare the run time of usefulR::frequency_table() with base summary().
# Timings depend on the machine; the recorded values are kept as `#>` comments
# (columns: user, system, elapsed) so the script still parses.
system.time(usefulR::frequency_table(SimpleEduData))
#>  user system elapsed
#> 0.002  0.000   0.003
system.time(summary(SimpleEduData))
#>  user system elapsed
#> 0.002  0.000   0.002