Hello,
I'm taking a class on Data Analytics, and I turned in my code that processes a 10k line JSON file and decided that I wanted to learn R at the same time to parse through the supplied JSON file. The way the JSON is formatted is, it's a single .json file but it has 10k individual JSON objects with each object having a nested JSON array that I need to get account data from, account type, number of each account total and the sum of assets in each account type. I can tell my code works with a small data set, but when I process the 10k line file I let it run overnight and still it didn't process. I'm sure my code is just horribly inefficient, but I'm not sure how best to go about tuning it to solve for this. I'd appreciate any feedback on how to tune my code to run more efficiently. Thank you!
library("rjson")
library("dplyr")
parseJsonData <- function (sourceFile, outputFile)
{
#Get all total lines in the source file provided
parsedJson <- readLines(sourceFile)
#Get each amount for each account type
accountTypeAmounts <- GetAccountTypeAmounts(parsedJson)
#Clean up old output file
if(file.exists(outputFile)){
file.remove(outputFile)
}
#Loop over each line in the sourceFile, parse the JSON and append to DataFrame
JsonAcctData <- NULL
for(i in 1:length(parsedJson)){
jsonValue <- fromJSON(parsedJson[[i]])
frame <- data.frame(jsonValue)
JsonAcctData <- rbind(JsonAcctData, frame)
}
#Get account types
accountType <- select(JsonAcctData, "accountHistory.type")
#print(accountType)
#Get account type total count
accountTypeCount <- table(unlist(select(JsonAcctData, "accountHistory.type")))
#Take all dataframes and consolodate them into one
finalAcctData <- ConsolodateDataFrames(accountType, accountTypeAmounts, accountTypeCount)
#Write the DataFrame to the output file in CSV format
write.csv(finalAcctData, file = outputFile)
}
GetAccountTypeAmounts <- function(json) {
amount <- grep('\"amount\":\"\\d+\\.\\d+\"', json, value = TRUE)
amount <- as.numeric(gsub('.*amount\":\"(\\d+\\.+\\d+)\".*', '\\1', amount, perl = TRUE))
type <- grep('\"type\":\"\\w+\"', json, value = TRUE)
type <- gsub('.*type\":\"(\\w+)\".*', '\\1', type, perl = TRUE)
data.frame(type, amount)
}
ConsolodateDataFrames <- function(accountType, accountTypeAmounts, accountTypeCount) {
#Create consolodated dataframe
consolodatedDataFrame <- data.frame(matrix(ncol = 3, nrow = 0))
headers <- c("account_type", "the_count", "sum_amount")
colnames(consolodatedDataFrame) <- headers
types <- list("withdrawal","deposit","invoice","payment")
#Consolodate all dataframes into one
for(actType in types){
amtSum <- sum(accountTypeAmounts[accountTypeAmounts$type == actType, "amount"])
actCount <- as.character(accountTypeCount[actType][1])
newRow <- data.frame(account_type=actType, the_count=actCount, sum_amount=amtSum)
consolodatedDataFrame <- rbind(consolodatedDataFrame, newRow)
}
consolodatedDataFrame
}
Here is a small snippet of data that I was working with, the original file was 10k lines.
{"name":"Test1", "accountHistory":[{"amount":"107.62","date":"2012-02-02T06:00:00.000Z","business":"CompanyA","name":"Home Loan Account 6220","type":"payment","account":"11111111"},{"amount":"650.88","date":"2012-02-02T06:00:00.000Z","business":"CompanyF","name":"Checking Account 9001","type":"payment","account":"123123123"},{"amount":"878.63","date":"2012-02-02T06:00:00.000Z","business":"CompanyG","name":"Money Market Account 8743","type":"deposit","account":"123123123"}]}
{"name":"Test2", "accountHistory":[{"amount":"199.29","date":"2012-02-02T06:00:00.000Z","business":"CompanyB","name":"Savings Account 3580","type":"invoice","account":"12312312"},{"amount":"841.48","date":"2012-02-02T06:00:00.000Z","business":"Company","name":"Home Loan Account 5988","type":"payment","account":"123123123"},{"amount":"116.55","date":"2012-02-02T06:00:00.000Z","business":"Company","name":"Auto Loan Account 1794","type":"withdrawal","account":"12312313"}]}
{"name":"Test3", "accountHistory":[{"amount":"602.45","date":"2012-02-02T06:00:00.000Z","business":"CompanyC","name":"Personal Loan Account 9924","type":"deposit","account":"12312312"},{"amount":"377.08","date":"2012-02-02T06:00:00.000Z","business":"Company","name":"Investment Account 1002","type":"withdrawal","account":"123123123"},{"amount":"204.44","date":"2012-02-02T06:00:00.000Z","business":"Company","name":"Auto Loan Account 2452","type":"withdrawal","account":"123123123"}]}
{"name":"Test4", "accountHistory":[{"amount":"31.19","date":"2012-02-02T06:00:00.000Z","business":"CompanyD","name":"Auto Loan Account 3893","type":"payment","account":"12312312"},{"amount":"820.25","date":"2012-02-02T06:00:00.000Z","business":"Company","name":"Savings Account 6260","type":"payment","account":"123123123"},{"amount":"582.19","date":"2012-02-02T06:00:00.000Z","business":"Company","name":"Auto Loan Account 9826","type":"payment","account":"123123123"}]}
{"name":"Test5", "accountHistory":[{"amount":"340.93","date":"2012-02-02T06:00:00.000Z","business":"CompanyE","name":"Checking Account 5055","type":"withdrawal","account":"12312312"},{"amount":"523.31","date":"2012-02-02T06:00:00.000Z","business":"Company","name":"Savings Account 1680","type":"deposit","account":"123123123"},{"amount":"153.38","date":"2012-02-02T06:00:00.000Z","business":"Company","name":"Money Market Account 1010","type":"withdrawal","account":"123123123"}]}
Parsing the above JSON creates a CSV value with the below values
"","account_type","the_count","sum_amount"
"1","withdrawal","1",474.37
"2","deposit","1",878.63
"3","invoice","1",0
"4","payment","2",582.19