# wilcox 检验批处理示例

https://cloud.tencent.com/developer/article/1666764
#GRID_input_new_p10 group group
library(doBy) #使用其中的 summaryBy() 以方便按分组计算均值、中位数

#读取数据
gene <- read.table('gene.txt', sep = '\t', row.names = 1, header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
group <- read.table('group.txt', sep = '\t', header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
result <- NULL

#使用循环，逐一对各基因进行两组间 wilcox 秩和检验
for (n in 1:nrow(gene)) {
gene_n <- data.frame(t(gene[n,subset(group, group %in% c(1, 2))\$sample]))
gene_id <- names(gene_n)[1]
names(gene_n)[1] <- 'gene'

gene_n\$sample <- rownames(gene_n)
gene_n <- merge(gene_n, group, by = 'sample', all.x = TRUE)

gene_n\$group <- factor(gene_n\$group)
p_value <- wilcox.test(gene~group, gene_n)\$p.value
if (!is.na(p_value) & p_value < 0.05) {
stat <- summaryBy(gene~group, gene_n, FUN = c(mean, median))
result <- rbind(result, c(gene_id, as.character(stat[1,1]), stat[1,2], stat[1,3], as.character(stat[2,1]), stat[2,2], stat[2,3], p_value))
}
}

#输出统计结果
result <- data.frame(result)
names(result) <- c('gene_id', 'group1', 'mean1', 'median1', 'group2', 'mean2', 'median2', 'p_value')
write.table(result, 'gene.wilcox.txt', sep = '\t', row.names = FALSE, quote = FALSE)

> gene <- read.table('GRID_input_new_p10.csv', sep = '\t', row.names = 1, header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
> group <- read.table('group.csv', sep = ',', header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
> gene_2 <- data.frame(t(gene[2,subset(group, group %in% c(1, 2))\$sample]))
> gene_2
[1] Micrococcus.aloeverae
<0 rows> (or 0-length row.names)
> gene_id <- names(gene_2)[1]
> names(gene_2)[1] <- 'gene'
> rownames(gene_2)
character(0)
> for (n in 1:nrow(gene)) {
+ gene_n <- data.frame(t(gene[n,subset(group, group %in% c(1, 2))\$sample]))
+ gene_id <- names(gene_n)[1]
+ names(gene_n)[1] <- 'gene'
+
+ gene_n\$sample <- rownames(gene_n)
+ gene_n <- merge(gene_n, group, by = 'sample', all.x = TRUE)
+
+ gene_n\$group <- factor(gene_n\$group)
+ p_value <- wilcox.test(gene~group, gene_n)\$p.value
+ if (!is.na(p_value) & p_value < 0.05) {
+ stat <- summaryBy(gene~group, gene_n, FUN = c(mean, median))
+ result <- rbind(result, c(gene_id, as.character(stat[1,1]), stat[1,2], stat[1,3], as.character(stat[2,1]), stat[2,2], stat[2,3], p_value))
+ }
+ }
Error in wilcox.test.formula(gene ~ group, gene_n) :
grouping factor must have exactly 2 levels
> group <- read.table('group.csv', sep = ',', header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
> group
sample group
1 LOR002C 1
2 LOR003C 1
3 LOR005C 1
4 LOR006C 1
5 LOR007C 1
6 LOR008C 1
7 LOR009C 1
8 LOR011C 1
9 LOR012C 1
10 LOR013C 1
11 LOR014C 1
12 LOR015C 1
13 LOR016C 1
14 LOR017C 1
15 LOR018C 1
16 LOR019C 1
17 LOR020C 1
18 LOR021C 1
19 LOR022C 1
20 LOR023C 1
21 LOR024C 1

> for (n in 1:nrow(gene)) {
+ gene_n <- data.frame(t(gene[n,subset(group, group %in% c(1, 2))\$sample]))
+ gene_id <- names(gene_n)[1]
+ names(gene_n)[1] <- 'gene'
+
+ gene_n\$sample <- rownames(gene_n)
+ gene_n <- merge(gene_n, group, by = 'sample', all.x = TRUE)
+
+ gene_n\$group <- factor(gene_n\$group)
+ p_value <- wilcox.test(gene~group, gene_n)\$p.value
+ if (!is.na(p_value)){
+ stat <- summaryBy(gene~group, gene_n, FUN = c(mean, median))
+ result <- rbind(result, c(gene_id, as.character(stat[1,1]), stat[1,2], stat[1,3], as.character(stat[2,1]), stat[2,2], stat[2,3], p_value))
+ }
+ }
> result <- data.frame(result)
> names(result) <- c('gene_id', 'group1', 'mean1', 'median1', 'group2', 'mean2', 'median2', 'p_value')
> write.table(result, 'gene.wilcox.all.csv', sep = '\t', row.names = FALSE, quote = FALSE)
> for (n in 1:nrow(gene)) {
+ gene_n <- data.frame(t(gene[n,subset(group, group %in% c(1, 2))\$sample]))
+ gene_id <- names(gene_n)[1]
+ names(gene_n)[1] <- 'gene'
+
+ gene_n\$sample <- rownames(gene_n)
+ gene_n <- merge(gene_n, group, by = 'sample', all.x = TRUE)
+
+ gene_n\$group <- factor(gene_n\$group)
+ p_value <- wilcox.test(gene~group, gene_n)\$p.value
+ if (!is.na(p_value) & p_value < 0.5) {
+ stat <- summaryBy(gene~group, gene_n, FUN = c(mean, median))
+ result <- rbind(result, c(gene_id, as.character(stat[1,1]), stat[1,2], stat[1,3], as.character(stat[2,1]), stat[2,2], stat[2,3], p_value))
+ }
+ }
> result <- data.frame(result)
> names(result) <- c('gene_id', 'group1', 'mean1', 'median1', 'group2', 'mean2', 'median2', 'p_value')