(Continued from last reply)
# Excel functions --------------------------------------------------------------------------------
highlight_new_data = function(source_data, differences_df, workbook, style_to_use){
# Function for creating x,y coordinates of cells identified as bad data
# ::param:: source_data = dataframe to evaluate
# ::param:: differences_df = dataframe of differences
# ::param:: workbook = the workbook which is being writtent o
# ::param:: style_to_use = the style use to highlight the data
difference_ids = difference_coordinates(differences_df)
# Take the vectors of Deal IDs and column names
walk2(difference_ids$deal_id, difference_ids$column, function(row, col){
# Computes the row number from the Deal ID
rownum = source_data %>%
rowid_to_column() %>%
filter(deal_id == row) %>%
pull(rowid)
# Computes the column position
colnum = which(colnames(source_data) == col)
# Converts these x,y coordinates to Excel cell references
addStyle(wb = workbook, sheet = "Deal Data", style = style_to_use, rows = rownum+1, cols = colnum, stack = T)
}
)
}
bad_data_coordinates = function(df){
# Function for creating x,y coordinates of cells identified as bad data
# ::param:: df = dataframe to evaluate
df %>%
rowid_to_column("row") %>%
mutate(negative_lease_term = (lease_term_months < 0),
blank_sqft = is.na(square_feet),
future_sign_date = (lease_signed_date > today()),
bad_expiration_date = (lease_expiration_date < today()|lease_expiration_date < lease_commencement_date),
time_considered_low = ((time_considered+2) < lease_term_months),
time_considered_high = ((time_considered-2) > lease_term_months)) %>%
select(row, negative_lease_term:time_considered_high) %>%
pivot_longer(negative_lease_term:time_considered_high, names_to = "problem") %>%
filter(value) %>%
mutate(col = case_when(str_detect(problem, "negative_lease_term") ~ which(colnames(df) == "lease_term_months"),
str_detect(problem, "blank_sqft") ~ which(colnames(df) == "square_feet"),
str_detect(problem, "future_sign_date") ~ which(colnames(df) == "lease_signed_date"),
str_detect(problem, "bad_expiration_date") ~ which(colnames(df) == "lease_expiration_date"),
str_detect(problem, "time_considered_low") ~ which(colnames(df) == "time_considered"),
str_detect(problem, "time_considered_high") ~ which(colnames(df) == "time_considered")),
problem = str_replace_all(problem, c("negative_lease_term" = "Negative Lease Term",
"blank_sqft" = "Missing Sqft",
"future_sign_date" = "Sign date in future",
"bad_expiration_date" = "Expiration date in past OR prior to commencement date",
"time_considered_low" = "Lease term greater than months in considerations",
"time_considered_high" = "Lease term less than months in considerations"))) %>%
select(row, col, problem)
}
output_to_excel = function(df, input_file, output_file, property_type, new_data, update = F, human_cols, box = T, logger, data_specs){
# Output a dataframe to an excel file, marking bad data and any changes made to the previous file version
# ::param:: df = the data
# ::param:: filename = the file to write to
# ::param:: market = the property type of the data
# ::param:: differences = a two-column coordinate dataframe locating cells which have changed
# ::param:: update = logical indicating whether to update an existing file
wb_title = paste(input_file$market, property_type, input_file$type, "data")
deal_info(logger, data_specifics = data_specs, "Preparing Excel file")
# Change date format to string to avoid Excel date issue
df = df %>%
mutate(`Research Notes` = NA) %>%
arrange(`New?`, desc(deal_id), desc(parse_date)) %>%
mutate(across(c(lease_signed_date, lease_commencement_date, lease_occupancy_date, lease_expiration_date, parse_date),
as.character)) %>%
# Order columns
select(Researcher, Status, `New?`, parse_date, deal_id, `Research Notes`,
any_of("No. of changes"), is_confidential, mta_deal_id, mta_deal_status, deal_type, property_type, property_name, cass_address_plus_suite, property_floor, suite, cass_city, cass_state, cass_zip, lat, lon, region, true_market,
tenant_buyer:square_feet, lease_signed_date, lease_commencement_date, lease_occupancy_date,
lease_expiration_date, lease_term_months, time_considered, effective_rent_sqft,
considerations_base_rent, considerations_raw_data,
lease_agreement_type:future_actions,
contains("tenant_buyer_rep"), contains("landlord_seller_rep"),
happy:division, everything())
# Identify bad data
# Things to highlight RED
# - negative lease terms
# - blank square feet
# - sign date in the future
# - expiration date before today OR before commencement date
if(nrow(df)>0){
bad_data = bad_data_coordinates(df)
}
# Swap column names to human-readable names
cols_df = human_cols %>%
add_row(human_names = "Parse Date", clean_names = "parse_date", .before = 1)
shared_cols = intersect(colnames(df), cols_df$clean_names)
human_colnames = cols_df[match(shared_cols, cols_df$clean_names), "human_names"] %>% pull()
# Prepping Excel file ----------------------------------------------------------------------------
# 1. Create workbook
wb = createWorkbook(creator = "Mark Barrett",
title = wb_title)
# 2. Add sheet
addWorksheet(wb, "Deal Data", tabColour = "#006A4D")
# 3. Formatting
header_style = createStyle(fontColour = "white", fgFill = "#006A4D", border = "bottom", fontSize = 12,
halign = "center", valign = "center", wrapText = T)
freezePane(wb, "Deal Data", firstActiveRow = 2, firstActiveCol = 7)
# Styles
confidential_style = createStyle(fontColour = "#9C0006", fgFill = "#FFC7CE")
highlight_style = createStyle(fgFill = "yellow")
bad_data_style = createStyle(fgFill = "red")
dollar_style = createStyle(numFmt = "CURRENCY")
colwidths = c(20,19,8,13,11,15, # Frozen columns
16,12,14,18,32,46,30,16, # Confidential - Property Floor
8,15,13,11,9,9,16,15, # Suite - True Market
50,50,40,25,11,13, # Tenant/Buyer - Square feet
21,21,21,21,13,15,15, # Signed date - Effective rent
100,100,24, # Considerations - space type
20,24,21,20,23,17, # TIA - Electric expenses
18,35,35,35,35,35,80,80,50) # Future Actions - Tenant/Buyers Industry
# 4. Write data
df_renamed = df %>%
rename_with(~human_colnames, all_of(shared_cols)) %>%
rename(`Time Considered` = time_considered,
`Effective Rent per sqft` = effective_rent_sqft,
`Considerations (Base rent)` = considerations_base_rent,
`Considerations (Raw data)` = considerations_raw_data)
writeData(df_renamed, wb = wb, sheet = "Deal Data", headerStyle = header_style, withFilter = T)
# 5. Resize column widths
setColWidths(wb, sheet = "Deal Data", cols = 1:length(colwidths), widths = colwidths)
remaining_cols = seq.int(length(colwidths)+1, ncol(df))
colname_widths = tibble(column = colnames(df_renamed)[remaining_cols]) %>%
map(nchar) %>%
flatten_dbl()+3 # Add 3 to column widths because some are extremely small
setColWidths(wb, sheet = "Deal Data", cols = remaining_cols, widths = colname_widths)
if(nrow(df)>0){
# Highlight confidential data
confidential_rows = df %>%
rowid_to_column("row") %>%
filter(is_confidential == "Yes") %>%
pull(row)
addStyle(wb = wb, sheet = "Deal Data", style = confidential_style, rows = confidential_rows+1, cols = 1:ncol(df), gridExpand = T, stack=T)
# Describe why data is bad with comments
pwalk(list(bad_data$row+1, bad_data$col, bad_data$problem), function(row, col, prob_text){
problem = createComment(comment = prob_text,
author = "Deal Parser",
visible = F, width = 1, height = 1)
writeComment(wb = wb, sheet = "Deal Data", row = row, col = col, comment = problem)
})
# Highlight bad data
addStyle(wb = wb, sheet = "Deal Data", style = bad_data_style, rows = bad_data$row+1, cols = bad_data$col, stack=T)
}
# Highlight new data
if(update){
# Find the cells which need to be highlighted
highlight_new_data(source_data = df, new_data, workbook = wb, style = highlight_style)
}
# Highlight new columns
# if(Sys.Date() =="2020-06-22"){
# new_cols = which(str_detect(colnames(df_renamed), "Derived"))
# walk(new_cols,
# function(col) addStyle(wb = wb, sheet = "Deal Data", style = highlight_style, rows = 2:nrow(df_renamed), cols = col, stack = T))
# }
# Format dollar amounts
addStyle(wb = wb, sheet = "Deal Data", style = dollar_style, rows = 2:(nrow(df)+1), cols = which(colnames(df) == "effective_rent_per_sqft"), stack=T)
# Data Validation --------------------------------------------------------------------------------
# Values and colours
dv_vals = c("In Progress", "Entered", "Verified", "Issues (wait for lease)", "Already in system")
dv_cols = c("#FFDD00", "#00b2dd", "#00a657", "#F58220", "#69BE28")
if(nrow(df)>0){
add_data_validation(wb, df, dv_vals, dv_cols)
}
# Tracking Sheet ---------------------------------------------------------------------------------
addWorksheet(wb = wb, sheetName = "Tracking", tabColour = "#69BE28")
header_row = 3
# Count of statuses
status_tab = tibble(
Status = dv_vals) %>%
mutate(Count = str_c("COUNTIF('Deal Data'!$B:$B, $A$", (header_row+1):(nrow(.)+header_row), ")")) %>%
add_row(
Status = "Not entered",
Count = str_c(nrow(df_renamed), "-COUNTA('Deal Data'!$A:$A)"))
# Count of statuses by researcher
status_researcher_tab = crossing(distinct(df_renamed, Researcher), Status = dv_vals) %>%
filter(!is.na(Researcher)) %>%
when(nrow(.) > 0 ~ (.) %>%
mutate(Count = str_c("COUNTIFS('Deal Data'!$A:$A, $E", (header_row+1):(nrow(.)+header_row),
", 'Deal Data'!$B:$B, $F", (header_row+1):(nrow(.)+header_row), ")")),
~ (.))
if(nrow(status_researcher_tab) > 0){
class(status_researcher_tab$Count) <- c(class(status_researcher_tab$Count), "formula")
}
if(nrow(status_tab) > 0){
class(status_tab$Count) <- c(class(status_tab$Count), "formula")
}
writeData(wb = wb,
x = status_tab,
sheet = "Tracking",
startCol = 1, startRow = header_row,
headerStyle = header_style, borders = "all")
note_style = createStyle(fgFill = "yellow", textDecoration = "bold")
writeData(wb = wb,
sheet = "Tracking",
x = "Note: If researcher is not already listed below, copy the cells of another researcher (e.g. E3:G7) and change the name",
startCol = 5,
colNames = F)
mergeCells(wb = wb, sheet = "Tracking", cols = 5:14, rows = 1)
addStyle(wb = wb, style = note_style, sheet = "Tracking", rows = 1, cols = 5)
writeData(wb = wb,
x = status_researcher_tab,
sheet = "Tracking",
startCol = 5, startRow = header_row,
headerStyle = header_style, borders = "all")
setColWidths(wb = wb, sheet = "Tracking", widths = c(rep("auto", 4), 20, rep("auto", 10)), cols = 1:15)
# Update Excel File ------------------------------------------------------------------------------
deal_info(logger, data_specifics = data_specs, "Writing Excel file")
if(update){
# browser()
box_write(wb,
dir_id = input_file$folder,
file_name = paste0(input_file$market, "_",
input_file$type, "Comp_",
property_type, "_Deal", ".xlsx"),
description = paste(input_file$market, property_type, input_file$type,
"data in Deal, cleaned to represent considerations and escalations in base rent terms.",
"Created at", now()),
write_fun = saveWorkbook, overwrite = T)
deal_info(logger, data_specifics = data_specs, str_c("Updated Excel file."))
# Writing a new file
} else {
box_write(wb,
dir_id = input_file$folder,
file_name = paste0(input_file$market, "_",
input_file$type, "Comp_",
property_type, "_Deal", ".xlsx"),
description = paste(input_file$market, property_type, input_file$type,
"data in Deal, cleaned to represent considerations and escalations in base rent terms.",
"Created at", now()),
write_fun = saveWorkbook, overwrite = T)
deal_info(logger, data_specifics = data_specs, str_c("Created new Excel file."))
}
}
add_data_validation = function(wb, df, dv_vals, dv_cols){
addWorksheet(wb, "DataValidationList", visible = F)
writeData(wb, sheet = 2, x = dv_vals, colNames = F)
dataValidation(wb, sheet = 1,
rows = 2:(nrow(df)+1),
cols = which(colnames(df) == "Status"),
type = "list",
value = "DataValidationList!$A$1:$A$5")
# Conditional Formatting
walk2(dv_vals, dv_cols, function(text, colour){
cell_style = createStyle(bgFill = colour)
conditionalFormatting(wb, sheet = 1, rows = 2:(nrow(df)+1),
cols = which(colnames(df) == "Status"),
rule = text,
type = "contains",
style = cell_style)
})
}
# Aggregate functions -----------------------------------------------------------------------------
compare_and_update = function(input_file, type, cleaned_extract, box = F, testing = T, column_names,
logger, data_specs){
# This function compares the cleaned extract with our previous property type file
# pull market name
data_specs = data_specs %>%
append(list(property_type = type))
property_type_data = cleaned_extract %>%
filter(str_detect(property_type, type)) %>%
mutate(across(everything(),
as.character))
if(nrow(property_type_data) == 0){
deal_info(logger, data_specifics = data_specs, "No relevant data in input file")
return()
}
if(box){
box_dir = as_tibble(box_search_folders(input_file$market, ancestor_folder_ids = 102452968963)) %>%
filter(str_detect(path, "test", negate = !testing))
input_file$folder = box_dir$id
if(nrow(box_dir) > 1){
stop("Found multiple directories")
}
# Check if the directory exists, if it doesn't => create it
if(nrow(box_dir) < 1) {
message("Could not find a directory for ", input_file$market)
boxr::box_dir_create(dir_name = paste(input_file$market, "Deal "), parent_dir_id = 102452968963)
box_dir = as_tibble(box_search_folders(input_file$market, ancestor_folder_ids = 102452968963)) %>%
filter(str_detect(path, "test", negate = !testing))
input_file$folder = box_dir$id
}
# Set output location ----------------------------------------------------------------------
# browser()
output_file_info = as_tibble(box_ls(box_dir$id)) %>%
filter(str_detect(name, input_file$type),
str_detect(name, !!type)) %>%
as.list()
} else {
## - Create the output file name
output_file_info = str_c(here("Deal CompExtracts",
paste(data_specs$market, "Deal"),
str_extract(input_file, "(?<=/)[A-z_\\s]+(?=_Current)")), "_", data_specs$property_type, ".xlsx")
}
# Update the historical file --------------------------------------------------------
# Check if file exists
if( (length(compact(output_file_info)) > 0) ){
# 1. Pull in previous file
previous_file = pull_previous_file(output_file_info, current_data = property_type_data, box,
logger, data_specs)
# 2. Identify which records from the extract are new
new_records = identify_new(current_extract = property_type_data, historical_df = previous_file, logger, data_specs)
# 3. Identify which records from the extract are old
existing_records = identify_existing(current_extract = property_type_data, new_records_df = new_records, historical_df = previous_file, logger, data_specs)
# 4. Identify differences in existing records between the extract and the previous file
differences = identify_differences(existing_records_df = existing_records, logger, data_specs)
record_changes = identify_changes(differences_df = differences, logger, data_specs)
# 5. Build final dataframe
updated_df = build_final_df(history_df = previous_file,
changes = record_changes,
new_records_df = new_records, logger, data_specs)
# Output the resulting data to Excel -------------------------------------------------------------
# Provided there is at least 1 row
if(nrow(updated_df) < 1){
invisible()
} else {
output_to_excel(df = updated_df,
input = input_file,
output = output_file_info,
property_type = type,
new_data = differences,
update = T,
human_cols = column_names,
logger = logger,
data_specs = data_specs)
}
} else {
## If file does not exist:
deal_info(logger, data_specifics = data_specs, "Creating file")
# output the file to Excel
output_to_excel(df = property_type_data,
input = input_file,
output = output_file_info,
property_type = type,
human_cols = column_names,
logger = logger, data_specs = data_specs)
}
}
clean_deal_files = function(deal_data){
clean_deal_df = basic_cleaning_box(deal_data)
# Pull the raw column names
original_cols = readable_colnames(deal_data)
# Transform the considerations columns
deal_info(logger, "Transform considerations")
convert_considerations(clean_deal_df) %>%
select(-Researcher, -Status, -`New?`)
}
generate_from_input_files = function(file,
prop_types = c("Office","Industrial","Retail","Miscellaneous","Housing","Land"),
testing = T,
box = F){
# This function performs all the actions on a raw Deal Extract file
if(box){
# Print the market name
data_specs = list(market = file$market,
transaction_type = file$type)
deal_info(logger, data_specifics = data_specs, paste("File: ", file$name))
raw_deal_df = read_deal_data(file, box=T)
clean_deal_df = basic_cleaning_box(raw_deal_df)
# Pull the raw column names
original_cols = readable_colnames(raw_deal_df)
# Transform the considerations columns
deal_info(logger, data_specifics = data_specs, "Transform considerations")
converted_deal_df = convert_considerations(clean_deal_df)
# Compare the Deal data to last week and update each property type file
walk(prop_types,
~compare_and_update(input_file = file,
box = T,
type = .,
cleaned_extract = converted_deal_df,
testing = testing,
column_names = original_cols,
logger,
data_specs))