Error in value[[3L]](cond) : Parsing failure cleaning data

caveman · June 28, 2022, 6:30pm

I was given a script that takes raw data and parses it out into a readable format. The raw data is located in a Box folder and when the script is done it is supposed to place it in a subfolder there. There's multiple folders that are being updated with the script. The first two folders get updated, but an error occurs when it tries to update the third folder. I was thinking of editing the script to avoid the third folder to narrow the issue and see if that folder is the cause in the meantime, but I would still need to solve this I feel.

I ran the script no issues yesterday so I'm not sure why I'm getting issues now. I'm getting a parsing failure error and I don't think it has to do with authenticating into Box because there's a function that takes care of that. I got confirmation as well that no one has any of the files open in said directory. Kind of stuck here. Here's the script, if you need to see what the 'source' for the functions looks like as well let me know, thank you guys.

source("FunctionName.R")
# Set working directory on Box
setwd("C:/Users/user1/Box") 

# Authenticate Box
box_auth()

library(here)
library(furrr)
plan(multisession)

log_file = here("CompExtracts", "logs", str_c("log-", today(), ".txt"))

file.create("log_file")

logger <- logger(
  appenders = list(
    file_appender("log_file", append = T, layout = deal_log_layout()),
    console_appender(layout = deal_log_layout())
  ))


#mutate adds variables
box_input_files_list = as_tibble(box_ls(83843180925)) %>%
  select(file = name, id) %>% 
  mutate(type = str_extract(file, "[:alpha:]+(?=Comp)"),
         market = str_extract(file, "[[:alpha:]\\s]+(?=_Current)"),
         name = str_replace_all(file, c("Deal_" = "", "Comp_Extract_" = " ", "_Current\\.xlsx" = ""))) %>% 
  # Transform df to list
  as.list() %>% 
  transpose() %>% 
  # Set the names for each list element
  set_names(., map_chr(., pluck, "name"))

# Parse Extracts ---------------------------------------------------------------------------
tic("Total Deal process:")
walk(box_input_files_list, #input_files,
     # Function
     generate_from_input_files, 
     # Arguments
     testing = F,
     # prop_types = c("Housing", "Land"),
     box = T)
#notes timer and computes the elapsed time of tic
toc()

dvetsch75 · June 28, 2022, 7:00pm

Can you share the full traceback please? Do you get any information about what function is throwing the error, etc.?

caveman · June 28, 2022, 10:30pm

It looks like it's coming from DealFunctions , but from where I have no idea. I'll post the source in addition to the traceback.

> traceback()
19: stop("Parsing failure cleaning data") at DealFunctions.R#231
18: value[[3L]](cond)
17: tryCatchOne(expr, names, parentenv, handlers[[1L]])
16: tryCatchList(expr, classes, parentenv, handlers)
15: tryCatch({
        cleaned_df = clean_names(file_df) %>% mutate(across(c(deal_id, 
            square_feet, lease_term_months, ends_with("_sqft")), 
            as.numeric), square_feet = coalesce(na_if(square_feet, 
            0), na_if(building_total_sqft, 0)), across(contains("_date"), 
            ymd))  
if (historical) {
            cleaned_df[, -na.omit(match("no_of_changes", colnames(cleaned_df)))]
        }
        else {
            cols_to_remove = c("property_address_1", "property_address_2", 
                "property_city", "property_state", "property_postal_code", 
                "property_country", "acreage", "cass_address_range", 
                "cass_plus4", "cass_post_direction", "cass_pre_direction", 
                "cass_street_name", "cass_suffix", "employee_id", 
                "lma", "lob", "lob_subdivision", "market", "effective_rent_per_sqft_2", 
                "considerations_raw_data_2")
            cleaned_df[, -na.omit(match(cols_to_remove, colnames(cleaned_df)))]
        }
    }, error = function(cond) {
     ... at DealFunctions.R#212
14: basic_cleaning_box(., historical = T)
13: select(., researcher, status, new, parse_date, any_of(c("time_considered", 
        "effective_rent_per_sqft", "considerations_base_rent", colnames(current_data))))
12: rename(., Researcher = researcher, Status = status, `New?` = new)
11: mutate(., `New?` = NA, time_considered = round(time_considered, 
        3))
10: mutate(., across(everything(), as.character))
9: read_deal_data(history_file, historical = T, box = T) %>% basic_cleaning_box(historical = T) %>% 
       select(researcher, status, new, parse_date, any_of(c("time_considered", 
           "effective_rent_per_sqft", "considerations_base_rent", 
           colnames(current_data)))) %>% rename(Researcher = researcher, 
       Status = status, `New?` = new) %>% mutate(`New?` = NA, time_considered = round(time_considered, 
       3)) %>% mutate(across(everything(), as.character)) at DealFunctions.R#338
8: pull_previous_file(output_file_info, current_data = property_type_data, 
       box, logger, data_specs) at DealFunctions.R#831
7: compare_and_update(input_file = file, box = T, type = ., cleaned_extract = converted_deal_df, 
       testing = testing, column_names = original_cols, logger, 
       data_specs)
6: .f(.x[[i]], ...)
5: map(.x, .f, ...)
4: walk(prop_types, ~compare_and_update(input_file = file, box = T, 
       type = ., cleaned_extract = converted_deal_df, testing = testing, 
       column_names = original_cols, logger, data_specs)) at DealFunctions.R#913
3: .f(.x[[i]], ...)
2: map(.x, .f, ...)
1: walk(box_input_files_list, generate_from_input_files, testing = F, 
       box = T)

Not all of the functions from the source are being used in the script I don't believe. What confuses me is why the script works sometimes.

caveman · June 28, 2022, 10:37pm

## Deal Parser functions
library(tidyverse)
library(lubridate)
library(readxl)
library(janitor)
library(openxlsx)
library(tictoc)
library(log4r)
library(boxr)


deal_log_layout = function (level, ...) 
{
  function(level,
           ..., data_specifics = list()){
    args_present = data_specifics %>% #keep(list(market, transaction_type, property_type), negate(missing)) %>% 
      flatten_chr() %>% 
      str_flatten(collapse = " | ")
    
    msg <- paste0(..., collapse = "")
    
    if(args_present == "") {
      sprintf("%s [%s] %s\n", level, 
              log4r:::fmt_current_time("%Y-%m-%d %H:%M:%S"), 
              msg)
    } else {
      sprintf("%s [%s] %s\n", args_present, 
              log4r:::fmt_current_time("%Y-%m-%d %H:%M:%S"), 
              msg)
    }
  }
  
}

deal_log = function (logger, ...) 
{
  if (logger$threshold > log4r:::INFO) 
    return(invisible(NULL))
  for (appender in logger$appenders) {
    appender(...)
  }
}

deal_info = function (logger, ...) 
{
  if (logger$threshold > log4r:::INFO) 
    return(invisible(NULL))
  for (appender in logger$appenders) {
    appender(level = "INFO",...)
  }
}

rowAny <- function(x) rowSums(x) > 0

clear_dirs = function(){
  # Function to clear the testing directory
  test_dirs = list.dirs(here("../User/Documents/DealParse/data/output_test"), recursive = F)
  
  test_files = map(test_dirs, list.files, full.names = T)
  
  walk(test_files, file.remove)
}

# Extract the market name from a filename
market_from_filename = function(string){
  markets = c("ABC", "EFG", "HIJ", "KLM", "NOP", "QRS")
  
  market_string = str_c(markets, collapse = "|")
  str_extract(string, market_string)
}


filetype_from_filename = function(string){
  types = c("Lease", "Sale")
  
  types_string = str_c(types, collapse = "|")
  str_extract(string, types_string)
}

# Extract property type from filename
proptype_from_filename = function(string){
  types = c("Office", "Industrial", "Retail", "Land", "Housing", "Miscellaneous")
  
  types_string = str_c(types, collapse = "|")
  str_extract(string, types_string)
}

# Make sure the files are not open
# Read in one cell from each file
check_files_open = function(inputs){
  tic(msg = "Files open check")
  
  # generate the output filenames based on the inputs
  output_files = here("Deal CompExtracts", 
                      # Add the market folder
                      paste(market_from_filename(inputs), "Deal"), 
                      # Add the Deal_[Type]_Extract_[Market]
                      str_extract(input_files, "(?<=/)[A-z_\\s]+(?=_Current)")) %>% 
    # Add the Property type and file extension
    map(., function(filepath){
      str_c(filepath, "_", 
            c("Office","Industrial","Retail","Miscellaneous","Housing","Land"), ".xlsx")
    }) %>% 
    # Flatten this list of character vectors to one long character vector
    flatten_chr()
  browser()
  # Walk over both inputs and outputs and check
  walk(c(inputs, output_files), 
       function(filename){ 
         file_con = file(filename)
         if(isOpen(file_con)) warning("File is open:", filename)
         close(file_con)})
  toc()
}

# Create matching table between column names for humans and machines
readable_colnames = function(df){
  
  cols_to_manage =  df %>% 
    select(-any_of(c("Researcher", "Status", "New?")))
  
  tibble(
    human_names = cols_to_manage %>% colnames(),
    clean_names = cols_to_manage %>% clean_names() %>% colnames()
  )
}






# Data cleaning functions ------------------------------------------------------------------------

read_deal_data = function(file, historical = F, box = F){
  # Read in excel files, clean them, and output cleaned dataframes
  # ::param:: filename = the file to clean and read
  if(historical){
    sheet_name = "Deal Data"
    invisible()
  } else {
    sheet_name = "Data"
    if(box){
      tic(paste(file$market, "- Reading in data"))
    } else {
      tic(paste(market_from_filename(file), "- Reading in data"))
    }
    
  }
  
  ## Read in data
  file_df = tryCatch({
    if(box){
      box_read(file$id, read_fun = readxl::read_excel, sheet = sheet_name, guess_max = 5000)
    } else {
      read_excel(filename, sheet = sheet_name, guess_max = 5000)
    }
  },
  error = function(cond) {stop("Parsing failure reading in data")})
  
  if(nrow(file_df) > 100000){
    stop("Pulling in too many rows! (>100,000)")
  }
  
  if(historical){
    invisible()
  } else {
    toc()
  }
  return(file_df)
}

basic_cleaning = function(file_df, historical = F){
  # Read in excel files, clean them, and output cleaned dataframes
  # ::param:: file_df = the data to clean
  
  
  ## Clean data
  file_df_clean = tryCatch({
    clean_names(file_df) %>% 
      # convert string vars to numeric where they ought to be
      mutate(across(c(deal_id, square_feet, lease_term_months, tenant_improvement_allowance_sqft), 
                    as.numeric)) %>% 
      # replace_na(list(tenant_improvement_allowance_sqft = 0)) %>% 
      mutate(across(contains("_date"), ymd)) %>% 
      when(historical ~ select(., -any_of("no_of_changes")),
           ~ select(.,-any_of(c("property_address_1", "property_address_2", "property_city", 
                                "property_state", "property_postal_code","property_country",
                                "acreage", "cass_address_range", "cass_plus4", 
                                "cass_post_direction", "cass_pre_direction", "cass_street_name", "cass_suffix",
                                "employee_id","lma", "lob", "lob_subdivision", "market",
                                "effective_rent_per_sqft_2","considerations_raw_data_2"))))
    
  },
  error = function(cond) {stop("Parsing failure cleaning data")}
  )
  if(historical){
    invisible()
  } else {
    toc()
  }
  return(file_df_clean)
}


basic_cleaning_box = function(file_df, historical=F){
  # Read in excel files, clean them, and output cleaned dataframes
  # ::param:: filename = the file to clean and read
  ## Clean data
  tryCatch({
    cleaned_df = clean_names(file_df) %>% 
      # convert string vars to numeric where they ought to be
      mutate(across(c(deal_id, square_feet, lease_term_months, ends_with("_sqft")), 
                    as.numeric),
             square_feet = coalesce(na_if(square_feet, 0), na_if(building_total_sqft, 0)),
             across(contains("_date"), ymd))
    
    if(historical){
      cleaned_df[, -na.omit(match("no_of_changes", colnames(cleaned_df)))]
    } else {
      cols_to_remove = c("property_address_1", "property_address_2", "property_city", "property_state", "property_postal_code","property_country",
                         "acreage", 
                         "cass_address_range", "cass_plus4", "cass_post_direction", "cass_pre_direction", "cass_street_name", "cass_suffix",
                         "employee_id","lma", "lob", "lob_subdivision", "market",
                         "effective_rent_per_sqft_2","considerations_raw_data_2")
      cleaned_df[, -na.omit(match(cols_to_remove, colnames(cleaned_df)))]
    }
  },
  error = function(cond) {stop("Parsing failure cleaning data")}
  )
}

convert_considerations = function(df){
 
  considerations_level = tryCatch(
    expr = {
      df %>% 
        # Separate the considerations column and create a new row for each
        separate_rows(considerations, sep = "//") %>%
        # Count the number of considerations
        group_by(deal_id) %>% 
        mutate(cons_num = if_else(is.na(considerations), NA_character_, paste("Consideration", 1:n())),
               considerations = str_trim(considerations)) %>% 
        ungroup() %>% 
        # Order the considerations by their order of appearance
        mutate(cons_num = fct_inorder(cons_num, ordered=T)) %>% 
        arrange(deal_id, cons_num) %>% 
        ## Separate each element of the considerations:
        # Type and amount
        separate(considerations, into = c("considerations_type", "considerations_amount"), sep = "\\s\\(", remove = F) %>%
        # Length
        separate(considerations_amount, into = c("considerations_amount", "considerations_length"), sep = "\\sfor\\s")
    },
    error = function(cond) {stop("Error in conversion to considerations-level data")})
  
  # Considerations-level computations --------------------------------------------------------------
  
  considerations_computations = tryCatch(
    expr = {
      considerations_level %>%
       
        mutate(considerations_amount = as.numeric(str_extract(considerations, "(?<=\\(\\$)[0-9\\.]{1,12}")),
               time_considered = as.numeric(str_extract(considerations_length, ".+(?=\\smonth)")),
               base_rent_sqft = if_else(is.na(square_feet), 
                                        considerations_amount, 
                                        considerations_amount/square_feet),
               considerations_base = str_c("($", round(base_rent_sqft, 2), 
                                            " for ", considerations_length)) %>%

        group_by(deal_id) %>% 
        mutate(gross_rent = sum(considerations_amount*time_considered, na.rm=T),
               gross_ti = tenant_improvement_allowance_sqft*square_feet,
               effective_rent = gross_rent - gross_ti,
               effective_rent_monthly = effective_rent/sum(time_considered, na.rm=T),
               effective_rent_sqft = round(effective_rent_monthly/square_feet, 2),
               initial_base_rent = first(base_rent_sqft, order_by = cons_num)) %>% 
        ungroup()
    },
    error = function(cond) {stop("Error in consideration-level computations.")}
  )
  
  deal_level = tryCatch(
    expr = {
      
      considerations_computations %>% 
 
        select(-any_of(c("considerations", "considerations_amount", "base_rent_sqft", 
                         "considerations_length", "considerations_type",
                         "gross_rent", "gross_ti", "effective_rent", "effective_rent_monthly"))) %>% 
        pivot_wider(id_cols = c(deal_id, effective_rent_sqft, initial_base_rent), 
                    names_from = cons_num, 
                    values_from = c(considerations_base, time_considered)) %>% 
        # Calculate the time considered
        mutate(time_considered = round(rowSums(select(., contains("time_considered")), na.rm=T), 3)) %>% 
        # Remove NA column
        select(-matches("NA", ignore.case = F), -contains("time_considered_")) %>% 
        # Concatenate the considerations to the prior format, now in base rent format
        unite(considerations_base_rent, matches("Consideration"), na.rm=T, sep = " // ")
    },
    error = function(cond) {stop("Error in conversion back to deal-level data")})
  
  
  ## Ordering columns
  final_df = tryCatch(
    expr = {
      deal_level %>% 
    
        mutate(Researcher = NA,
               Status = NA,
               `New?` = NA,
               parse_date = today()) %>% 
        right_join(df, by = "deal_id") %>% 
        rename(considerations_raw_data = considerations)
    },
    error = function(cond) {stop("Error joining deal-level data back to original dataframe")})
  
  
  return(final_df)
}


# File Comparison Functions ------------------------------------------------------------------------

pull_previous_file = function(history_file, current_data, box = F, logger, data_specs){
  deal_info(logger, data_specifics = data_specs, "Reading in researcher file")
  if(box){
    read_deal_data(history_file, historical = T, box = T) %>% 
      basic_cleaning_box(historical = T) %>% 
      select(researcher, status, new, parse_date, 
             any_of(c("time_considered", "effective_rent_per_sqft", "considerations_base_rent",
                      colnames(current_data)))) %>% 
      # Rename
      rename(Researcher = researcher, Status = status, `New?` = new) %>% 
      mutate(`New?` = NA,
             time_considered = round(time_considered, 3)) %>% 
      mutate(across(everything(), 
                    as.character))
  } else {
    # Read in previous version of file we are creating, clean it
    basic_cleaning(history_file, historical = T) %>% 
      select(researcher, status, new, parse_date, 
             any_of(c("time_considered", "effective_rent_per_sqft", "considerations_base_rent",
                      colnames(current_data)))) %>% 
      # Rename
      rename(Researcher = researcher, Status = status, `New?` = new) %>% 
      mutate(`New?` = NA,
             time_considered = round(time_considered, 3)) %>% 
      mutate(across(everything(), 
                    as.character))
  }
  
}
#Keeps records which are not in previous file i.e. Deal IDs
identify_new = function(current_extract, historical_df, logger, data_specs){
  deal_info(logger, data_specifics = data_specs, "Identify new records")
  #browser()
  
  # anti_join() finds Deal IDs which are not in the historical file
  anti_join(current_extract, historical_df, by = "deal_id") %>% 
    mutate(`New?` = T,
           parse_date = today()) %>% 
    mutate(across(everything(), 
                  as.character))
}

identify_existing = function(current_extract, new_records_df, historical_df, logger, data_specs){
  deal_info(logger, data_specifics = data_specs, "Identify existing records")
  # Identify records which are already in the historical file
  existing_records_in_extract = current_extract %>% 
    anti_join(new_records_df, by = "deal_id")             # Current data which is not in new_records
  
 
  list("Current" = existing_records_in_extract,
       "Past" = historical_df) %>% 
    bind_rows(.id = "source")
}

identify_differences = function(existing_records_df, logger, data_specs){
  deal_info(logger, data_specifics = data_specs, "Identify existing records which have been updated")
  

  existing_records_df %>% 
    mutate(across(c(ends_with("_sqft"), square_feet,free_rent, base_rent, 
                    tax_amount, parking_ratio, parking_expenses_space,
                    office_area), as.numeric)) %>% 
    group_by(deal_id) %>% 
    filter(rowAny(across(c(-source, -Researcher, -parse_date, -`New?`, -Status), ~n_distinct(.) > 1))) %>% 
    ungroup() %>% 
    arrange(deal_id)
}

difference_coordinates = function(differences_df){
  differences_df %>% 
    mutate(across(everything(), 
                  as.character)) %>%    # Convert dates to character to enable pivoting
    # Pivot data to create a dataframe which has a row for each column in a Deal comp
    pivot_longer(cols = c(-source, -deal_id, -Researcher, -Status, -`New?`, -parse_date), names_to = "column", values_ptypes = list(value = "character")) %>% 
    distinct(deal_id, column, value) %>% # Keep unique combinations of column:value pairs for each Deal ID
    group_by(deal_id, column) %>% 
    filter(n() > 1) %>% # Identify rows where there is more than one observation for a column for each Deal ID
    ungroup() %>% 
    distinct(deal_id, column)
}

identify_changes = function(differences_df, logger, data_specs){
  dealinfo(logger, data_specifics = data_specs, "Identify specific changes")
  
  # Select only the updated records
  # Keep the data entry columns from the previous file
  data_entry_cols = differences_df %>% 
    filter(source == "Past") %>% 
    select(deal_id, Researcher, Status)
  
  # Select the new data from the current file
  updated_records = differences_df %>% 
    filter(source == "Current") %>% 
    select(-source, -Researcher, -Status)
  
  num_differences = difference_coordinates(differences_df) %>% 
    count(deal_id, name = "No. of changes") %>% 
    mutate(across(everything(), 
                  as.character))
  
  # Join the three together
  data_entry_cols %>% 
    left_join(updated_records, by = "deal_id") %>% 
    left_join(num_differences, by = "deal_id") %>% 
    mutate(across(everything(), 
                  as.character))
  
}

build_final_df = function(history_df, changes, new_records_df, logger, data_specs){
  deal_info(logger, data_specifics = data_specs, "Combine the researcher file with new deals and updated deals")
  
  # Select only historical records which are NOT in the dataframe of changed records
  historical_nochanges = history_df %>% 
    anti_join(changes, by = "deal_id")
  
  # Join together the unchanged existing records, the existing records with changes, and the new records
  bind_rows(historical_nochanges, changes, new_records_df) %>% 
    readr::type_convert(col_types = cols()) %>% 
    replace_na(list(`No. of changes` = 0))
}

caveman · June 28, 2022, 10:38pm

(Continued from last reply)

# Excel functions --------------------------------------------------------------------------------

highlight_new_data = function(source_data, differences_df, workbook, style_to_use){
  # Function for creating x,y coordinates of cells identified as bad data
  # ::param:: source_data = dataframe to evaluate
  # ::param:: differences_df = dataframe of differences
  # ::param:: workbook = the workbook which is being writtent o
  # ::param:: style_to_use = the style use to highlight the data
  difference_ids = difference_coordinates(differences_df)
  
  # Take the vectors of Deal IDs and column names
  walk2(difference_ids$deal_id, difference_ids$column, function(row, col){
    # Computes the row number from the Deal ID
    rownum = source_data %>% 
      rowid_to_column() %>% 
      filter(deal_id == row) %>% 
      pull(rowid)
    
    # Computes the column position
    colnum = which(colnames(source_data) == col)
    # Converts these x,y coordinates to Excel cell references
    addStyle(wb = workbook, sheet = "Deal Data", style = style_to_use, rows = rownum+1, cols = colnum, stack = T)
  }
  )
}


bad_data_coordinates = function(df){
  # Function for creating x,y coordinates of cells identified as bad data
  # ::param:: df = dataframe to evaluate
  
  df %>% 
    rowid_to_column("row") %>% 
    mutate(negative_lease_term = (lease_term_months < 0),
           blank_sqft = is.na(square_feet),
           future_sign_date = (lease_signed_date > today()),
           bad_expiration_date = (lease_expiration_date < today()|lease_expiration_date < lease_commencement_date),
           time_considered_low = ((time_considered+2) < lease_term_months),
           time_considered_high = ((time_considered-2) > lease_term_months)) %>% 
    select(row, negative_lease_term:time_considered_high) %>% 
    pivot_longer(negative_lease_term:time_considered_high, names_to = "problem") %>% 
    filter(value) %>% 
    mutate(col = case_when(str_detect(problem, "negative_lease_term") ~ which(colnames(df) == "lease_term_months"),
                           str_detect(problem, "blank_sqft") ~ which(colnames(df) == "square_feet"),
                           str_detect(problem, "future_sign_date") ~ which(colnames(df) == "lease_signed_date"),
                           str_detect(problem, "bad_expiration_date") ~ which(colnames(df) == "lease_expiration_date"),
                           str_detect(problem, "time_considered_low") ~ which(colnames(df) == "time_considered"),
                           str_detect(problem, "time_considered_high") ~ which(colnames(df) == "time_considered")),
           problem = str_replace_all(problem, c("negative_lease_term" = "Negative Lease Term",
                                                "blank_sqft" = "Missing Sqft",
                                                "future_sign_date" = "Sign date in future",
                                                "bad_expiration_date" = "Expiration date in past OR prior to commencement date",
                                                "time_considered_low" = "Lease term greater than months in considerations",
                                                "time_considered_high" = "Lease term less than months in considerations"))) %>% 
    select(row, col, problem)
}

output_to_excel = function(df, input_file, output_file, property_type, new_data, update = F, human_cols, box = T, logger, data_specs){
  # Output a dataframe to an excel file, marking bad data and any changes made to the previous file version
  # ::param:: df = the data
  # ::param:: filename = the file to write to
  # ::param:: market = the property type of the data
  # ::param:: differences = a two-column coordinate dataframe locating cells which have changed
  # ::param:: update = logical indicating whether to update an existing file
  wb_title = paste(input_file$market, property_type, input_file$type, "data")
  deal_info(logger, data_specifics = data_specs, "Preparing Excel file")

  # Change date format to string to avoid Excel date issue
  df = df %>% 
    mutate(`Research Notes` = NA) %>% 
    arrange(`New?`, desc(deal_id), desc(parse_date)) %>% 
    mutate(across(c(lease_signed_date, lease_commencement_date, lease_occupancy_date, lease_expiration_date, parse_date), 
                  as.character)) %>% 
    # Order columns
    select(Researcher, Status, `New?`, parse_date, deal_id, `Research Notes`,
           any_of("No. of changes"), is_confidential, mta_deal_id, mta_deal_status, deal_type, property_type, property_name, cass_address_plus_suite, property_floor, suite, cass_city, cass_state, cass_zip, lat, lon, region, true_market, 
           tenant_buyer:square_feet, lease_signed_date, lease_commencement_date, lease_occupancy_date, 
           lease_expiration_date, lease_term_months, time_considered, effective_rent_sqft, 
           considerations_base_rent, considerations_raw_data, 
           lease_agreement_type:future_actions, 
           contains("tenant_buyer_rep"), contains("landlord_seller_rep"),
           happy:division, everything())
  
  # Identify bad data
  # Things to highlight RED
  # - negative lease terms
  # - blank square feet
  # - sign date in the future
  # - expiration date before today OR before commencement date
  if(nrow(df)>0){
    bad_data = bad_data_coordinates(df)
  }
  
  
  # Swap column names to human-readable names
  cols_df = human_cols %>% 
    add_row(human_names = "Parse Date", clean_names = "parse_date", .before = 1)
  
  shared_cols = intersect(colnames(df), cols_df$clean_names)
  human_colnames = cols_df[match(shared_cols, cols_df$clean_names), "human_names"] %>% pull()
  
  
  # Prepping Excel file ----------------------------------------------------------------------------
  
  # 1. Create workbook
  wb = createWorkbook(creator = "Mark Barrett", 
                      title = wb_title)
  # 2. Add sheet
  addWorksheet(wb, "Deal Data", tabColour = "#006A4D")
  # 3. Formatting
  header_style = createStyle(fontColour = "white", fgFill = "#006A4D", border = "bottom", fontSize = 12, 
                             halign = "center", valign = "center", wrapText = T)
  freezePane(wb, "Deal Data", firstActiveRow = 2, firstActiveCol = 7)  
  # Styles
  confidential_style = createStyle(fontColour = "#9C0006", fgFill = "#FFC7CE")
  highlight_style = createStyle(fgFill = "yellow")
  bad_data_style = createStyle(fgFill = "red")
  dollar_style = createStyle(numFmt = "CURRENCY")
  
  colwidths = c(20,19,8,13,11,15, # Frozen columns
                16,12,14,18,32,46,30,16, # Confidential - Property Floor
                8,15,13,11,9,9,16,15, # Suite - True Market
                50,50,40,25,11,13, # Tenant/Buyer - Square feet
                21,21,21,21,13,15,15, # Signed date - Effective rent
                100,100,24, # Considerations - space type
                20,24,21,20,23,17, # TIA - Electric expenses
                18,35,35,35,35,35,80,80,50) # Future Actions - Tenant/Buyers Industry

  # 4. Write data
  df_renamed = df %>% 
    rename_with(~human_colnames, all_of(shared_cols)) %>% 
    rename(`Time Considered` = time_considered, 
           `Effective Rent per sqft` = effective_rent_sqft, 
           `Considerations (Base rent)` = considerations_base_rent,
           `Considerations (Raw data)` = considerations_raw_data)

  writeData(df_renamed, wb = wb, sheet = "Deal Data", headerStyle = header_style, withFilter = T)
  
  # 5. Resize column widths
  setColWidths(wb, sheet = "Deal Data", cols = 1:length(colwidths), widths = colwidths)
  remaining_cols = seq.int(length(colwidths)+1, ncol(df))
  colname_widths = tibble(column = colnames(df_renamed)[remaining_cols]) %>% 
    map(nchar) %>% 
    flatten_dbl()+3 # Add 3 to column widths because some are extremely small
  setColWidths(wb, sheet = "Deal Data", cols = remaining_cols, widths = colname_widths)
  
  
  
  if(nrow(df)>0){
    # Highlight confidential data
    confidential_rows = df %>% 
      rowid_to_column("row") %>% 
      filter(is_confidential == "Yes") %>% 
      pull(row)
    addStyle(wb = wb, sheet = "Deal Data", style = confidential_style, rows = confidential_rows+1, cols = 1:ncol(df), gridExpand = T, stack=T)
    
    # Describe why data is bad with comments
    pwalk(list(bad_data$row+1, bad_data$col, bad_data$problem), function(row, col, prob_text){
      problem = createComment(comment = prob_text, 
                              author = "Deal  Parser", 
                              visible = F, width = 1, height = 1)
      writeComment(wb = wb, sheet = "Deal Data", row = row, col = col, comment = problem)
    })
    # Highlight bad data
    addStyle(wb = wb, sheet = "Deal Data", style = bad_data_style, rows = bad_data$row+1, cols = bad_data$col, stack=T)
    
  }
  
  # Highlight new data
  if(update){
    # Find the cells which need to be highlighted
    highlight_new_data(source_data = df, new_data, workbook = wb, style = highlight_style)
  }
  # Highlight new columns
  # if(Sys.Date() =="2020-06-22"){
  #   new_cols = which(str_detect(colnames(df_renamed), "Derived"))
  #   walk(new_cols,
  #        function(col) addStyle(wb = wb, sheet = "Deal Data", style = highlight_style, rows = 2:nrow(df_renamed), cols = col, stack = T))
  # }
  
  # Format dollar amounts
  addStyle(wb = wb, sheet = "Deal Data", style = dollar_style, rows = 2:(nrow(df)+1), cols = which(colnames(df) == "effective_rent_per_sqft"), stack=T)
  
  
  # Data Validation --------------------------------------------------------------------------------
  # Values and colours
  dv_vals = c("In Progress", "Entered", "Verified", "Issues (wait for lease)", "Already in system")
  dv_cols = c("#FFDD00", "#00b2dd", "#00a657", "#F58220", "#69BE28")
  
  if(nrow(df)>0){
    add_data_validation(wb, df, dv_vals, dv_cols)
  }
  
  
  # Tracking Sheet ---------------------------------------------------------------------------------
  addWorksheet(wb = wb, sheetName = "Tracking", tabColour = "#69BE28")
  
  header_row = 3
  # Count of statuses
  status_tab = tibble(
    Status = dv_vals) %>% 
    mutate(Count = str_c("COUNTIF('Deal Data'!$B:$B, $A$", (header_row+1):(nrow(.)+header_row), ")")) %>% 
    add_row(
      Status = "Not entered", 
      Count = str_c(nrow(df_renamed), "-COUNTA('Deal Data'!$A:$A)"))
  
  
  # Count of statuses by researcher
  status_researcher_tab = crossing(distinct(df_renamed, Researcher), Status = dv_vals) %>% 
    filter(!is.na(Researcher)) %>% 
    when(nrow(.) > 0 ~ (.) %>% 
           mutate(Count = str_c("COUNTIFS('Deal Data'!$A:$A, $E", (header_row+1):(nrow(.)+header_row), 
                                 ", 'Deal Data'!$B:$B, $F", (header_row+1):(nrow(.)+header_row),  ")")),
         ~ (.))
  if(nrow(status_researcher_tab) > 0){
    class(status_researcher_tab$Count) <- c(class(status_researcher_tab$Count), "formula")
  }
  
  if(nrow(status_tab) > 0){
    class(status_tab$Count) <- c(class(status_tab$Count), "formula")
  }
  
  
  writeData(wb = wb, 
            x = status_tab, 
            sheet = "Tracking", 
            startCol = 1, startRow = header_row, 
            headerStyle = header_style, borders = "all")
  
  note_style = createStyle(fgFill = "yellow", textDecoration = "bold")
  writeData(wb = wb, 
            sheet = "Tracking",
            x = "Note: If researcher is not already listed below, copy the cells of another researcher (e.g. E3:G7) and change the name",
            startCol = 5,
            colNames = F)
  
  mergeCells(wb = wb, sheet = "Tracking", cols = 5:14, rows = 1)
  addStyle(wb = wb, style = note_style, sheet = "Tracking", rows = 1, cols = 5)
  
  writeData(wb = wb, 
            x = status_researcher_tab, 
            sheet = "Tracking", 
            startCol = 5, startRow = header_row, 
            headerStyle = header_style, borders = "all")
  
  setColWidths(wb = wb, sheet = "Tracking", widths = c(rep("auto", 4), 20, rep("auto", 10)), cols = 1:15)
  # Update Excel File ------------------------------------------------------------------------------
  deal_info(logger, data_specifics = data_specs, "Writing Excel file")
  if(update){
    # browser()
    box_write(wb, 
              dir_id = input_file$folder, 
              file_name = paste0(input_file$market, "_", 
                                 input_file$type, "Comp_", 
                                 property_type, "_Deal", ".xlsx"), 
              description = paste(input_file$market, property_type, input_file$type, 
                                  "data in Deal, cleaned to represent considerations and escalations in base rent terms.",
                                  "Created at", now()),
              write_fun = saveWorkbook, overwrite = T)
    deal_info(logger, data_specifics = data_specs, str_c("Updated Excel file."))
    
    # Writing a new file
  } else {
    box_write(wb, 
              dir_id = input_file$folder, 
              file_name = paste0(input_file$market, "_", 
                                 input_file$type, "Comp_", 
                                 property_type, "_Deal", ".xlsx"), 
              description = paste(input_file$market, property_type, input_file$type, 
                                  "data in Deal, cleaned to represent considerations and escalations in base rent terms.",
                                  "Created at", now()),
              write_fun = saveWorkbook, overwrite = T)
    deal_info(logger, data_specifics = data_specs, str_c("Created new Excel file."))
  }
  
  
}

add_data_validation = function(wb, df, dv_vals, dv_cols){
  
  addWorksheet(wb, "DataValidationList", visible = F)
  
  writeData(wb, sheet = 2, x = dv_vals, colNames = F)
  
  dataValidation(wb, sheet = 1, 
                 rows = 2:(nrow(df)+1),
                 cols = which(colnames(df) == "Status"),
                 type = "list",
                 value = "DataValidationList!$A$1:$A$5")
  
  # Conditional Formatting
  walk2(dv_vals, dv_cols, function(text, colour){
    cell_style = createStyle(bgFill = colour)
    conditionalFormatting(wb, sheet = 1, rows = 2:(nrow(df)+1), 
                          cols = which(colnames(df) == "Status"),
                          rule = text,
                          type = "contains",
                          style = cell_style)
  })
}





# Aggregate functions -----------------------------------------------------------------------------

compare_and_update = function(input_file, type, cleaned_extract, box = F, testing = T, column_names, 
                              logger, data_specs){
  # This function compares the cleaned extract with our previous property type file
  # pull market name
  data_specs = data_specs %>% 
    append(list(property_type = type))
  
  property_type_data = cleaned_extract %>% 
    filter(str_detect(property_type, type)) %>% 
    mutate(across(everything(), 
                  as.character))
  
  if(nrow(property_type_data) == 0){
    deal_info(logger, data_specifics = data_specs, "No relevant data in input file")
    return()
  }
  
  if(box){
    box_dir = as_tibble(box_search_folders(input_file$market, ancestor_folder_ids = 102452968963)) %>% 
      filter(str_detect(path, "test", negate = !testing))
    
    input_file$folder = box_dir$id
    
    if(nrow(box_dir) > 1){
      stop("Found multiple directories")
    }
    # Check if the directory exists, if it doesn't => create it
    if(nrow(box_dir) < 1) {
      message("Could not find a directory for ", input_file$market)
      boxr::box_dir_create(dir_name = paste(input_file$market, "Deal "), parent_dir_id = 102452968963)
      
      box_dir = as_tibble(box_search_folders(input_file$market, ancestor_folder_ids = 102452968963)) %>% 
        filter(str_detect(path, "test", negate = !testing))
      
      input_file$folder = box_dir$id
    }
    
    
    # Set output location   ----------------------------------------------------------------------
    # browser()
    output_file_info = as_tibble(box_ls(box_dir$id)) %>% 
      filter(str_detect(name, input_file$type),
             str_detect(name, !!type)) %>% 
      as.list()
  } else {
    ## - Create the output file name
    output_file_info = str_c(here("Deal CompExtracts", 
                                  paste(data_specs$market, "Deal"), 
                                  str_extract(input_file, "(?<=/)[A-z_\\s]+(?=_Current)")), "_", data_specs$property_type, ".xlsx")
    
  }
  
  # Update the historical file --------------------------------------------------------
  # Check if file exists 
  if( (length(compact(output_file_info)) > 0) ){
    # 1. Pull in previous file
    previous_file = pull_previous_file(output_file_info, current_data = property_type_data, box, 
                                       logger, data_specs)
    # 2. Identify which records from the extract are new
    new_records = identify_new(current_extract = property_type_data, historical_df = previous_file, logger, data_specs)
    # 3. Identify which records from the extract are old
    existing_records = identify_existing(current_extract = property_type_data, new_records_df = new_records, historical_df = previous_file, logger, data_specs)
    # 4. Identify differences in existing records between the extract and the previous file
    differences = identify_differences(existing_records_df = existing_records, logger, data_specs)
    record_changes = identify_changes(differences_df = differences, logger, data_specs)
    # 5. Build final dataframe
    updated_df = build_final_df(history_df = previous_file, 
                                changes = record_changes, 
                                new_records_df = new_records, logger, data_specs)
    
    
    # Output the resulting data to Excel -------------------------------------------------------------
    
    # Provided there is at least 1 row
    if(nrow(updated_df) < 1){
      invisible()
    } else {
      output_to_excel(df = updated_df, 
                      input = input_file, 
                      output = output_file_info,
                      property_type = type,
                      new_data = differences, 
                      update = T, 
                      human_cols = column_names,
                      logger = logger,
                      data_specs = data_specs)
    }
  } else {
    ## If file does not exist:
    deal_info(logger, data_specifics = data_specs, "Creating file")
    # output the file to Excel
    output_to_excel(df = property_type_data, 
                    input = input_file,
                    output = output_file_info, 
                    property_type = type,
                    human_cols = column_names,
                    logger = logger, data_specs = data_specs)
  }
}

clean_deal_files = function(deal_data){
  
  clean_deal_df = basic_cleaning_box(deal_data)
  
  # Pull the raw column names
  original_cols = readable_colnames(deal_data)
  
  # Transform the considerations columns
  deal_info(logger, "Transform considerations")
  convert_considerations(clean_deal_df) %>% 
    select(-Researcher, -Status, -`New?`)
}

generate_from_input_files = function(file, 
                                     prop_types = c("Office","Industrial","Retail","Miscellaneous","Housing","Land"), 
                                     testing = T,
                                     box = F){
  # This function performs all the actions on a raw Deal  Extract file
  if(box){
    # Print the market name
    data_specs = list(market = file$market, 
                      transaction_type = file$type)
    
    deal_info(logger, data_specifics = data_specs, paste("File: ", file$name))
  
    raw_deal_df = read_deal_data(file, box=T)
    
    clean_deal_df = basic_cleaning_box(raw_deal_df)
    
    # Pull the raw column names
    original_cols = readable_colnames(raw_deal_df)
    
    # Transform the considerations columns
    deal_info(logger, data_specifics = data_specs, "Transform considerations")
    converted_deal_df = convert_considerations(clean_deal_df)
    
    # Compare the Deal data to last week and update each property type file
    walk(prop_types, 
         ~compare_and_update(input_file = file, 
                             box = T,
                             type = ., 
                             cleaned_extract = converted_deal_df,
                             testing = testing,
                             column_names = original_cols,
                             logger,
                             data_specs))

system · July 19, 2022, 10:39pm

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.