set_colnames_remove_unk_chrx <- function(df, x){
colnames(df) <- c("Chromosome", "?", "?2", "Index", "SNP1", "SNP2")
return(df[df$Chromosome == x, ])
}
unpack <- function(l){
return(l[[1]])
}
sig_test <- function(chr, path="", save=FALSE){
options(warn=-1)
weight <- read.csv("C:\\Users\\noree\\OneDrive - The University of Chicago\\Genetics Lab-Info and RMD file\\Genetics 20187 Dog GWAS files\\Major\\weight2022.csv")
names <- weight$namegetw
#Read in raw data files named {dog_name}.tped
dfs <- lapply(paste0(names, '.tped'), read.table, stringsAsFactors=FALSE)
dfs <- lapply(dfs, set_colnames_remove_unk_chrx, chr)
names(dfs) <- names
#Bin by weight
large_names <- weight[weight$weight > 35, 'name']
small_names <- weight[weight$weight <= 35, 'name']
large_df <- dfs[large_names]
small_df <- dfs[small_names]
rm(dfs)#free up memory
small_df <- Map(function(x, i) setNames(x, ifelse(names(x) %in% c("Chromosome", "Index", "?"), names(x), sprintf('%s.%d', names(x), i))), small_df, seq_along(small_df))
large_df <- Map(function(x, i) setNames(x, ifelse(names(x) %in% c("Chromosome", "Index", "?"), names(x), sprintf('%s.%d', names(x), i))), large_df, seq_along(large_df))
small_df <- Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = c("Chromosome", "Index", "?"), all = TRUE), small_df)
large_df <- Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = c("Chromosome", "Index", "?"), all = TRUE), large_df)
small_df[is.na(small_df)] <- 0
large_df[is.na(large_df)] <- 0
nsample <- nrow(small_df)
small_snps2 <- list()
large_snps2 <- list()
for(i in 1:nsample){
s <- ""
for(j in 1:length(small_names)){
s <- paste0(s, small_df[i, sprintf('%s.%d', "SNP1", j)], small_df[i, sprintf('%s.%d', "SNP2", j)])
}
small_snps2 <- append(small_snps2, s)
l <- ""
for(j in 1:length(large_names)){
l <- paste0(l, large_df[i, sprintf('%s.%d', "SNP1", j)], large_df[i, sprintf('%s.%d', "SNP2", j)])
}
large_snps2 <- append(large_snps2, l)
}
r
small_splitSNPs <- lapply(small_snps2, strsplit, "")
small_splitSNPs <- lapply(small_splitSNPs, unpack)
large_splitSNPs <- lapply(large_snps2, strsplit, "")
large_splitSNPs <- lapply(large_splitSNPs, unpack)
small_single_count <- lapply(lapply(small_splitSNPs, factor, levels=c("A", "T", "C", "G")), table)
large_single_count <- lapply(lapply(large_splitSNPs, factor, levels=c("A", "T", "C", "G")), table)
write.csv(small_single_count, paste0(path, "small_single_count", chr, ".csv"), row.names = FALSE)
write.csv(large_single_count, paste0(path, "large_single_count", chr, ".csv"), row.names = FALSE)
rm(small_splitSNPs)
rm(large_splitSNPs)
p_values <- numeric(nsample)
for(i in 1:nsample){
t <- rbind(small_single_count[[i]],large_single_count[[i]])
remove <- c()
for(c in 1:ncol(t)){
if(sum(t[,c] == numeric(2)) == 2){
remove <- append(remove, c)
}
}
t<-t[,remove*-1]
if(all(is.na(t))){
p_values[i] <- NA
}else{
p_values[i] <- chisq.test(t)$p.value
}
}
return(data.frame("pvals"=p_values,"indices"=small_df$Index[1:nsample]))
}
#This first code chunk has been assigned to me. The only thing that I have altered is that I have typed the full working directory for the variable "weight"
out1 <- sig_test(1)
out2 <- sig_test(2)
out3 <- sig_test(3)
out4 <- sig_test(4)
pvals1 <- out1$pvals[-1*which(is.na(out1$pvals))]
pvals2 <- out2$pvals[-1*which(is.na(out2$pvals))]
pvals3 <- out3$pvals[-1*which(isfi.na(out3$pvals))]
pvals4 <- out3$pvals[-1*which(is.na(out4$pvals))]
indices1 <- out1$indices
indices2 <- out2$indices
indices3 <- out3$indices
indices4 <- out4$indices
# Whenever I run this chunk, I get an error message, "Error in file(file, "rt"), cannot open the connection," with the traceback as follows:
"4. file(file, "rt")
3. FUN(X[[i]], ...)
2. lapply(paste0(names, ".tped"), read.table, stringsAsFactors = FALSE)
1. sig_test(1)"