set_colnames_remove_unk_chrx <- function(df, x){
colnames(df) <- c("Chromosome", "?", "?2", "Index", "SNP1", "SNP2")
return(df[df$Chromosome == x, ])
}
unpack <- function(l){
return(l[[1]])
}
sig_test <- function(chr, path="", save=FALSE){
options(warn=-1)
weight <- read.csv("weight2022.csv")
names <- weight$name
#Read in raw data files named {dog_name}.tped
dfs <- lapply(paste0(names, '.tped'), read.table, stringsAsFactors=FALSE)
dfs <- lapply(dfs, set_colnames_remove_unk_chrx, chr)
names(dfs) <- names
#Bin by weight
large_names <- weight[weight$weight > 35, 'name']
small_names <- weight[weight$weight <= 35, 'name']
large_df <- dfs[large_names]
small_df <- dfs[small_names]
rm(dfs)#free up memory
#Create unique column names for merge
small_df <- Map(function(x, i) setNames(x, ifelse(names(x) %in% c("Chromosome", "Index", "?"), names(x), sprintf('%s.%d', names(x), i))), small_df, seq_along(small_df))
large_df <- Map(function(x, i) setNames(x, ifelse(names(x) %in% c("Chromosome", "Index", "?"), names(x), sprintf('%s.%d', names(x), i))), large_df, seq_along(large_df))
#Merge dataframes
small_df <- Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = c("Chromosome", "Index", "?"), all = TRUE), small_df)
large_df <- Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = c("Chromosome", "Index", "?"), all = TRUE), large_df)
small_df[is.na(small_df)] <- 0
large_df[is.na(large_df)] <- 0
nsample <- nrow(small_df) #assumes both dfs have same nrows
#Creates list of strings for each chromosome index pair of bases
small_snps2 <- list()
large_snps2 <- list()
for(i in 1:nsample){
s <- ""
for(j in 1:length(small_names)){
s <- paste0(s, small_df[i, sprintf('%s.%d', "SNP1", j)], small_df[i, sprintf('%s.%d', "SNP2", j)])
}
small_snps2 <- append(small_snps2, s)
l <- ""
for(j in 1:length(large_names)){
l <- paste0(l, large_df[i, sprintf('%s.%d', "SNP1", j)], large_df[i, sprintf('%s.%d', "SNP2", j)])
}
large_snps2 <- append(large_snps2, l)
}
#Creates character vector for each chr, ind pair
small_splitSNPs <- lapply(small_snps2, strsplit, "")
small_splitSNPs <- lapply(small_splitSNPs, unpack)
large_splitSNPs <- lapply(large_snps2, strsplit, "")
large_splitSNPs <- lapply(large_splitSNPs, unpack)
#Count bases for each chr, ind pair
small_single_count <- lapply(lapply(small_splitSNPs, factor, levels=c("A", "T", "C", "G")), table)
large_single_count <- lapply(lapply(large_splitSNPs, factor, levels=c("A", "T", "C", "G")), table)
write.csv(small_single_count, paste0(path, "small_single_count", chr, ".csv"), row.names = FALSE)
write.csv(large_single_count, paste0(path, "large_single_count", chr, ".csv"), row.names = FALSE)
rm(small_splitSNPs)
rm(large_splitSNPs)
p_values <- numeric(nsample)
for(i in 1:nsample){
#factor makes sure each table has same ncol
t <- rbind(small_single_count[[i]], large_single_count[[i]])
#Cannot have column of 0s in chisq.test
remove <- c()
for(c in 1:ncol(t)){
if(sum(t[,c] == numeric(2)) == 2){
remove <- append(remove, c)
}
}
t<-t[,remove*-1]
#When there isn't an ATCG in SNP1 or 2 for any dog
if(all(is.na(t))){
p_values[i] <- NA
}else{
p_values[i] <- chisq.test(t)$p.value
}
}
return(data.frame("pvals"=p_values,"indices"=small_df$Index[1:nsample]))
}
#Code chunk 1
sig_test <- function(chr, path="", save=FALSE){
options(warn=-1)
weight <- read.csv("weight2022.csv")
names <- weight$name
}
View(weight)
out1 <- sig_test(1)
pvals1 <- out1$pvals[-1*which(is.na(out1$pvals))]
indices1 <- out1$indices[-1*which(is.na(out1$pvals))]
out2 <- sig_test(2)
pvals2 <- out2$pvals[-1*which(is.na(out2$pvals))]
indices2 <- out2$indices[-1*which(is.na(out2$pvals))]
out3 <- sig_test(3)
pvals3 <- out3$pvals[-1*which(is.na(out3$pvals))]
indices3 <- out3$indices[-1*which(is.na(out3$pvals))]
out4 <- sig_test(4)
pvals4 <- out3$pvals[-1*which(is.na(out4$pvals))]
indices4 <- out4$indices[-1*which(is.na(out4$pvals))]
#The "out, pvals and indices functions all output "NULL" which is not correct. What could be the problem. For your reference, I am working on a Windows laptop