Unexpected "Null" Results

InkyMooncake · November 29, 2022, 6:09am

set_colnames_remove_unk_chrx <- function(df, x){
  colnames(df) <- c("Chromosome", "?", "?2", "Index", "SNP1", "SNP2")
  return(df[df$Chromosome == x, ])
}

unpack <- function(l){
  return(l[[1]])
}

sig_test <- function(chr, path="", save=FALSE){
  options(warn=-1)
  weight <- read.csv("weight2022.csv")
  names <- weight$name
  
  
  #Read in raw data files named {dog_name}.tped
  dfs <- lapply(paste0(names, '.tped'), read.table, stringsAsFactors=FALSE)
  dfs <- lapply(dfs, set_colnames_remove_unk_chrx, chr)
  names(dfs) <- names
  
  #Bin by weight
  large_names <- weight[weight$weight > 35, 'name']
  small_names <- weight[weight$weight <= 35, 'name']
  large_df <- dfs[large_names]
  small_df <- dfs[small_names]
  rm(dfs)#free up memory
  
  #Create unique column names for merge
  small_df <- Map(function(x, i) setNames(x, ifelse(names(x) %in% c("Chromosome", "Index", "?"), names(x), sprintf('%s.%d', names(x), i))), small_df, seq_along(small_df))
  large_df <- Map(function(x, i) setNames(x, ifelse(names(x) %in% c("Chromosome", "Index", "?"), names(x), sprintf('%s.%d', names(x), i))), large_df, seq_along(large_df))
  
  #Merge dataframes
  small_df <- Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = c("Chromosome", "Index", "?"), all = TRUE), small_df)
  large_df <- Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = c("Chromosome", "Index", "?"), all = TRUE), large_df)
  small_df[is.na(small_df)] <- 0
  large_df[is.na(large_df)] <- 0
  
  nsample <- nrow(small_df) #assumes both dfs have same nrows
  
  #Creates list of strings for each chromosome index pair of bases
  small_snps2 <- list()
  large_snps2 <- list()
  for(i in 1:nsample){
    s <- ""
    for(j in 1:length(small_names)){
      s <- paste0(s, small_df[i, sprintf('%s.%d', "SNP1", j)], small_df[i, sprintf('%s.%d', "SNP2", j)])
    }
    small_snps2 <- append(small_snps2, s)
    
    l <- ""
    for(j in 1:length(large_names)){
      l <- paste0(l, large_df[i, sprintf('%s.%d', "SNP1", j)], large_df[i, sprintf('%s.%d', "SNP2", j)])
    }
    large_snps2 <- append(large_snps2, l)
  }
  
  #Creates character vector for each chr, ind pair
  small_splitSNPs <- lapply(small_snps2, strsplit, "")
  small_splitSNPs <- lapply(small_splitSNPs, unpack)
  large_splitSNPs <- lapply(large_snps2, strsplit, "")
  large_splitSNPs <- lapply(large_splitSNPs, unpack)
  
  #Count bases for each chr, ind pair
  small_single_count <- lapply(lapply(small_splitSNPs, factor, levels=c("A", "T", "C", "G")), table)
  large_single_count <- lapply(lapply(large_splitSNPs, factor, levels=c("A", "T", "C", "G")), table)
  write.csv(small_single_count, paste0(path, "small_single_count", chr, ".csv"), row.names = FALSE)
  write.csv(large_single_count, paste0(path, "large_single_count", chr, ".csv"), row.names = FALSE)
  rm(small_splitSNPs)
  rm(large_splitSNPs)
  
  p_values <- numeric(nsample)
  for(i in 1:nsample){
    #factor makes sure each table has same ncol
    t <- rbind(small_single_count[[i]], large_single_count[[i]])
    #Cannot have column of 0s in chisq.test
    remove <- c()
    for(c in 1:ncol(t)){
      if(sum(t[,c] == numeric(2)) == 2){
        remove <- append(remove, c)
      }
    }
    t<-t[,remove*-1]
    #When there isn't an ATCG in SNP1 or 2 for any dog
    if(all(is.na(t))){
      p_values[i] <- NA
    }else{
      p_values[i] <- chisq.test(t)$p.value
    }
  }
  return(data.frame("pvals"=p_values,"indices"=small_df$Index[1:nsample]))
}

#Code chunk 1

sig_test <- function(chr, path="", save=FALSE){
  options(warn=-1)
  weight <- read.csv("weight2022.csv")
  names <- weight$name
}

View(weight)
out1 <- sig_test(1)
pvals1 <- out1$pvals[-1*which(is.na(out1$pvals))]
indices1 <- out1$indices[-1*which(is.na(out1$pvals))]

out2 <- sig_test(2)
pvals2 <- out2$pvals[-1*which(is.na(out2$pvals))]
indices2 <- out2$indices[-1*which(is.na(out2$pvals))]

out3 <- sig_test(3)
pvals3 <- out3$pvals[-1*which(is.na(out3$pvals))]
indices3 <- out3$indices[-1*which(is.na(out3$pvals))]

out4 <- sig_test(4)
pvals4 <- out3$pvals[-1*which(is.na(out4$pvals))]
indices4 <- out4$indices[-1*which(is.na(out4$pvals))]
#The "out, pvals and indices functions all output "NULL" which is not correct. What could be the problem. For your reference, I am working on a Windows laptop

nirgrahamuk · November 29, 2022, 7:21am

Hello.
Thanks for providing code , but you could take further steps to make it more convenient for other forum users to help you.

Share some representative data that will enable your code to run and show the problematic behaviour.

You might use tools such as the library datapasta, or the base function dput() to share a portion of data in code form, i.e. that can be copied from forum and pasted to R session.

Reprex Guide

Further Advice.
If you have code that process many files, in iterative fashion, ask yourself if the repetitions are relevant to the problem, or would you see the issue with a single file ? If so your example should only concern a single file.
If your issue does not relate to file reading per se, i.e . You have no problem loading your raw data, your problem is manipulating/processing it, then you should modify your example to exclude all file loading code and substitute that code with example data that you prepared following the guide.

Good luck

system · January 10, 2023, 7:21am

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.