Data Preprocessing

# Load the full property table.
# readr::read_csv() has no `header` or `sep` arguments; calling it with them
# fails with: unused arguments (header = T, sep = ","). Base read.csv() accepts
# both, so it is used here instead.
property <- read.csv(file = "properties_2016.csv", header = TRUE, sep = ",")

Change Variable Types

### Some variables are read as numeric values but are really categorical, so convert them to factors
# Convert the named columns of a data frame to factors.
#
# Arguments:
#   data           - data frame (or tibble) whose columns are converted.
#   variable_names - character vector of column names to convert. Names that
#                    are not columns of `data` are skipped; duplicates are
#                    processed once.
# Returns:
#   `data` with each matching column replaced by as.factor() of that column.
change_numerical_to_factor <- function(data, variable_names) {
  # Iterate over the `variable_names` argument itself (not a global vector),
  # so the function behaves the same for every caller.
  for (name in intersect(variable_names, names(data))) {
    # `[[` extracts the column as a plain vector for data frames and tibbles.
    data[[name]] <- as.factor(data[[name]])
  }
  data
}
# Columns that are stored as numbers but hold category codes.
# NOTE(review): "regionidcity" was listed twice; the duplicate is removed here.
# If a different region id column was meant in its place, add it -- confirm.
cate_var <- c(
  "buildingqualitytypeid", "fips", "heatingorsystemtypeid",
  "propertylandusetypeid", "censustractandblock",
  "regionidcity", "regionidcounty", "unitcnt"
)
# Apply the same conversion to the training data and the full property table.
train <- change_numerical_to_factor(train, cate_var)
property <- change_numerical_to_factor(property, cate_var)

Feature Engineering

#### Month of the year shows predictive power, so use the month column
#Take in data and create features.  Use this for the train and total properties files
#This is EXCLUDING the month features which need to be added later
# Add engineered features to a property-level data frame.
#
# The month feature is NOT created here; it is derived separately from
# `transactiondate`.
#
# Arguments:
#   data - data frame containing taxvaluedollarcnt, taxamount,
#          calculatedfinishedsquarefeet, lotsizesquarefeet,
#          structuretaxvaluedollarcnt, regionidcity, regionidzip, and the four
#          columns dropped at the end of the pipeline.
# Returns:
#   `data` with the N_* feature columns added and rawcensustractandblock,
#   propertyzoningdesc, censustractandblock and assessmentyear removed.
make_features <- function(data) {
  data %>%
    mutate(
      N_value_ratio = taxvaluedollarcnt / taxamount,
      N_living_area_prop = calculatedfinishedsquarefeet / lotsizesquarefeet
    ) %>%
    # Per-city statistics.
    group_by(regionidcity) %>%
    mutate(
      # na.rm = TRUE: without it a single missing structure value turns the
      # average into NA for every row of that city. A city whose values are
      # all missing yields NaN.
      N_Avg_structuretaxvalue = mean(structuretaxvaluedollarcnt, na.rm = TRUE),
      N_city_count = n()
    ) %>%
    ungroup() %>%
    mutate(
      # Relative absolute deviation of the structure value from the city
      # average. The column name's spelling is kept unchanged because it is
      # part of the output that other code may reference.
      N_Dev_structuretaxvaluedolarcnt =
        abs(structuretaxvaluedollarcnt - N_Avg_structuretaxvalue) /
          N_Avg_structuretaxvalue,
      N_tax_score = taxvaluedollarcnt * taxamount
    ) %>%
    # Number of rows sharing the same zip code.
    group_by(regionidzip) %>%
    mutate(N_zip_count = n()) %>%
    ungroup() %>%
    # Drop the census-tract, zoning-description and assessment-year columns.
    dplyr::select(
      -rawcensustractandblock,
      -propertyzoningdesc,
      -censustractandblock,
      -assessmentyear
    )
}
# Derive the transaction month as a factor and drop the raw date column.
# Because this step removes `transactiondate`, running it a second time on the
# same `train` fails inside mutate() with "object 'transactiondate' not found".
# The guard makes the chunk safe to re-run.
if ("transactiondate" %in% names(train)) {
  train <- train %>%
    mutate(
      month = as.factor(month(as.Date(as.character(transactiondate))))
    ) %>%
    dplyr::select(-one_of("transactiondate"))
} else if (!("month" %in% names(train))) {
  # Neither the source column nor the derived one exists: fail loudly instead
  # of continuing without the month feature.
  stop(
    "`train` has neither `transactiondate` nor `month`; rebuild it from the raw data.",
    call. = FALSE
  )
}

Remove NA values

# Drop from `property` the columns named in `drop.column`.
# `drop.column` holds column names, so negating it directly fails with
# "invalid argument to unary operator"; one_of() turns the names into a
# selection that can be negated.
prop2 <- property %>% dplyr::select(-one_of(drop.column))

Remove outlier observations

Model

# Number of boosting iterations (trees) to fit.
number_trees <- 200

# Fit a gradient boosted model of logerror on all other columns of `train`.
gbmModel <- gbm(
  logerror ~ .,
  distribution = "gaussian",
  # var.monotone = c(0, 0, 0, 0, 0, 0),  # -1: monotone decrease,
  #                                      # +1: monotone increase,
  #                                      #  0: no monotone restrictions
  interaction.depth = 5,    # 1 means additive model, 2 is two-way interaction
  # n.minobsinnode = 10,    # minimum number of observations in the trees' terminal nodes
  # Half of the detected cores, as a whole number of at least 1:
  # detectCores() / 2 is fractional for an odd core count (0.5 on a single
  # core), and detectCores() may return NA.
  n.cores = max(1L, detectCores() %/% 2L, na.rm = TRUE),
  bag.fraction = 0.8,       # fraction of the sample selected to build each tree
  n.trees = number_trees,   # number of trees
  shrinkage = 0.033,        # shrinkage parameter to avoid over-fitting
  data = train
)

Predictions

LS0tDQp0aXRsZTogIlppbGxvdyBHcmFkaWVudCBCb29zdGVkIE1hY2hpbmUiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQojI0RhdGEgUHJlcHJvY2Vzc2luZw0KYGBge3J9DQoNCmxpYnJhcnkocmVhZHIpICMgQ1NWIGZpbGUgSS9PLCBlLmcuIHRoZSByZWFkX2NzdiBmdW5jdGlvbg0KDQojIyBSZWFkIHBhY2thZ2VzIChzb21lIG9mIHRoZSBwYWNrYWdlcyBhcmUgbm90IHJlcXVpcmVkLCBidXQgZGlkIG5vdCBib3RoZXIgdG8gZmluZCBvdXQgd2hpY2ggb25lcyApDQpwYWNrYWdlcyA8LSBjKCJnYm0iLCAieGdib29zdCIsICJkcGx5ciIsICJkYXRhLnRhYmxlIiwgImNhcmV0IiwgInJhdHRsZSIsICJ0aWR5ciIsICJnZ3Bsb3QyIiwgImx1YnJpZGF0ZSIsIA0KICAgICAgICAgICAgICAiY29ycnBsb3QiLCAibGVhZmxldCIsICJjYXJldEVuc2VtYmxlIiwgImUxMDcxIiwgInJwYXJ0LnBsb3QiLCAiVkdBTSIsICJNZXRyaWNzIiwgIk1hdHJpeCIpDQpwdXJycjo6d2FsayhwYWNrYWdlcywgbGlicmFyeSwgY2hhcmFjdGVyLm9ubHkgPSBUUlVFLCB3YXJuLmNvbmZsaWN0cyA9IEZBTFNFKQ0KDQojT25seSB0aG9zZSB3aGljaCB3ZXJlIHNvbGQNCnRyYWluX3RlbXAgPSByZWFkLmNzdigidHJhaW5fMjAxNl92Mi5jc3YiLCBoZWFkZXIgPSBULCBzZXAgPSAgIiwiKQ0KDQojQWxsIHByb3BlcnRpZXMgbGlzdGVkDQpwcm9wZXJ0eSA9IHJlYWQuY3N2KGZpbGUgPSAicHJvcGVydGllc18yMDE2LmNzdiIsIGhlYWRlciA9IFQsIHNlcCA9ICIsIikNCg0KI0pvaW4gWSBhbmQgWCB0byBmaXQgYSBtb2RlbA0KdHJhaW4gPSBtZXJnZSh0cmFpbl90ZW1wLCBwcm9wZXJ0eSwgYnkueCA9ICJwYXJjZWxpZCIsIGJ5LnkgPSAicGFyY2VsaWQiLCBhbGwueCA9IFRSVUUpDQoNCiMgUmVtb3ZlIHRlbXBvcmFyeSBkYXRhIHNldCB0byBzYXZlIHNwYWNlDQpybSh0cmFpbl90ZW1wKQ0KYGBgDQojQ2hhbmdlIFZhcmlhYmxlIFR5cGVzDQoNCmBgYHtyfQ0KIyMjIFNvbWUgdmFyaWFibGVzIGFyZSBzZWVuIGFzIG51bWVyaWNhbCB2YWx1ZXMsIGJ1dCB0aGV5IHNob3VsZCBiZSBjYXRlZ29yaWNhbCwgaGVuY2UgdHJhbnNmb3JtYXRpb24gDQpjaGFuZ2VfbnVtZXJpY2FsX3RvX2ZhY3RvciA9IGZ1bmN0aW9uKGRhdGEsIHZhcmlhYmxlX25hbWVzKSB7DQogIGZvciAoaSBpbiAxOmxlbmd0aCh2YXJpYWJsZV9uYW1lcykpIHsNCiAgICBpbmRleCA9IHdoaWNoKG5hbWVzKGRhdGEpID09IGNhdGVfdmFyW2ldKQ0KICAgIGlmIChsZW5ndGgoaW5kZXgpID4gMCkgew0KICAgICAgZGF0YVssaW5kZXhdID0gYXMuZmFjdG9yKGRhdGFbLGluZGV4XSkNCiAgICB9DQogIH0NCiAgcmV0dXJuKGRhdGEpDQp9DQoNCmNhdGVfdmFyID0gYyggImJ1aWxkaW5ncXVhbGl0eXR5cGVpZCIsICJmaXBzIiwgImhlYXRpbmdvcnN5c3RlbXR5cGVpZCIsICJwcm9wZXJ0eWxhbmR1c2V0eXBlaWQiLCAnY2Vuc3VzdHJhY3RhbmRibG9jaycsIA0KICAgICAgICAgICAgICAncmVnaW9uaWRjaXR5
JywgJ3JlZ2lvbmlkY291bnR5JywgJ3JlZ2lvbmlkY2l0eScsICJ1bml0Y250IikNCg0KdHJhaW4gPSBjaGFuZ2VfbnVtZXJpY2FsX3RvX2ZhY3Rvcih0cmFpbiwgY2F0ZV92YXIpDQpwcm9wZXJ0eSA9IGNoYW5nZV9udW1lcmljYWxfdG9fZmFjdG9yKHByb3BlcnR5LCBjYXRlX3ZhcikNCmBgYA0KDQojRmVhdHVyZSBFbmdpbmVlcmluZw0KYGBge3J9DQojIyMjIG1vbnRoIG9mIHRoZSB5ZWFyIHNob3dzIHByZWRpY2F0aXZlIHBvd2VyLCBzbywgdXNlIG1vbnRoIGNvbHVtbiANCiNUYWtlIGluIGRhdGEgYW5kIGNyZWF0ZSBmZWF0dXJlcy4gIFVzZSB0aGlzIGZvciB0aGUgdHJhaW4gYW5kIHRvdGFsIHByb3BlcnRpZXMgZmlsZXMNCg0KI1RoaXMgaXMgRVhDTFVESU5HIHRoZSBtb250aCBmZWF0dXJlcyB3aGljaCBuZWVkIHRvIGJlIGFkZGVkIGxhdGVyDQptYWtlX2ZlYXR1cmVzID0gZnVuY3Rpb24oZGF0YSkgew0KICBkYXRhICU+JSANCiAgICBtdXRhdGUoI3RyYW5zYWN0aW9uZGF0ZSA9IGFzLkRhdGUoYXMuY2hhcmFjdGVyKHRyYW5zYWN0aW9uZGF0ZSkpLA0KICAgICAgICAgICAjbW9udGggPSBhcy5mYWN0b3IobW9udGgodHJhbnNhY3Rpb25kYXRlKSksDQogICAgICAgICAgIE5fdmFsdWVfcmF0aW8gPSB0YXh2YWx1ZWRvbGxhcmNudC90YXhhbW91bnQsDQogICAgICAgICAgIE5fbGl2aW5nX2FyZWFfcHJvcCA9IGNhbGN1bGF0ZWRmaW5pc2hlZHNxdWFyZWZlZXQvbG90c2l6ZXNxdWFyZWZlZXQpICU+JQ0KICAgIA0KICAgICMgZ3JvdXBfYnkocmVnaW9uaWRjaXR5KSAlPiUNCiAgICAjIA0KICAgICMgbXV0YXRlKCBOX0F2Z19zdHJ1Y3R1cmV0YXh2YWx1ZSA9IG1lYW4oc3RydWN0dXJldGF4dmFsdWVkb2xsYXJjbnQpLA0KICAgICMgICAgICAgICBOX2NpdHlfY291bnQgPSBuKCkpICU+JQ0KICAgICMgdW5ncm91cCgpICU+JQ0KICAgIA0KICAgIG11dGF0ZSggTl9EZXZfc3RydWN0dXJldGF4dmFsdWVkb2xhcmNudCA9IGFicyhzdHJ1Y3R1cmV0YXh2YWx1ZWRvbGxhcmNudCAtIE5fQXZnX3N0cnVjdHVyZXRheHZhbHVlKS9OX0F2Z19zdHJ1Y3R1cmV0YXh2YWx1ZSwNCiAgICAgICAgICAgIE5fdGF4X3Njb3JlID0gdGF4dmFsdWVkb2xsYXJjbnQqdGF4YW1vdW50KSAgJT4lDQogICAgDQogICAgZ3JvdXBfYnkocmVnaW9uaWR6aXApICU+JQ0KICAgIA0KICAgIG11dGF0ZShOX3ppcF9jb3VudCA9IG4oKSkgJT4lDQogICAgdW5ncm91cCgpICU+JQ0KICAgIA0KICAgIGRwbHlyOjpzZWxlY3QoLSByYXdjZW5zdXN0cmFjdGFuZGJsb2NrLA0KICAgICAgICAgICAtIHByb3BlcnR5em9uaW5nZGVzYywNCiAgICAgICAgICAgLSBjZW5zdXN0cmFjdGFuZGJsb2NrLA0KICAgICAgICAgICAtIGFzc2Vzc21lbnR5ZWFyKSAjIyMgUmVtb3ZlIHRyYW5zYWN0aW9uZGF0ZSBjb2x1bW4gDQogIA0KICAjZGF0YSROX21vbnRoID0gYXMuZmFjdG9yKGRhdGEkTl9tb250aCkNCn0NCg0KdHJhaW4gPSB0cmFpbiAlPiUNCiAgbXV0YXRlKHRyYW5zYWN0aW9uZGF0
ZSA9IGFzLkRhdGUoYXMuY2hhcmFjdGVyKHRyYW5zYWN0aW9uZGF0ZSkpKSAlPiUgDQogICAgbXV0YXRlKG1vbnRoID0gYXMuZmFjdG9yKG1vbnRoKHRyYW5zYWN0aW9uZGF0ZSkpKSAgJT4lIA0KICAgIHNlbGVjdCgtb25lX29mKCJ0cmFuc2FjdGlvbmRhdGUiKSkNCg0KdHJhaW4yID0gbWFrZV9mZWF0dXJlcyh0cmFpbikNCnByb3AgPSBtYWtlX2ZlYXR1cmVzKHByb3BlcnR5KQ0KDQpgYGANCg0KI1JlbW92ZSBOQSB2YWx1ZXMNCmBgYHtyfQ0KIyMjIyMgSW52ZXN0aWdhdGUgdGhlIGNvbHVtbnMgd2l0aCBtYW55IE5BcywgcmVtb3ZlIGNvbHVtbnMgd2l0aCBtb3JlIHRoYW4gODAlIE5BICh0aGlzIG5lZWRzIG1vcmUgaW52ZXN0aWdhdGlvbnMgdGhvdWdoKQ0KIyMjIyBSZW1vdmUgY29sdW1ucyB3aXRoIE5BIG1vcmUgdGhhbiBOQV9wZXJjZW50YWdlX3RocmVzaG9sZCANCnJlbW92ZV92YXJpYWJsZV93aXRoX05BID0gZnVuY3Rpb24oZGF0YSwgTkFfcGVyY2VudGFnZV90aHJlc2hvbGQpIHsNCiAgdmVjdG9yZHJvcCA9IGRhdGFbLCBsYXBwbHkoZGF0YSwgZnVuY3Rpb24oeCkgc3VtKGlzLm5hKHgpKSAvIGxlbmd0aCh4KSApID49IE5BX3BlcmNlbnRhZ2VfdGhyZXNob2xkIF0NCiAgZGF0YVssIHdoaWNoKG5hbWVzKGRhdGEpICVpbiUgbmFtZXModmVjdG9yZHJvcCkpXSA9IE5VTEwNCiAgDQogIHJlc3VsdCA9IGxpc3QoZGF0YSA9IGRhdGEsIGRyb3AuY29sdW1uID0gbmFtZXModmVjdG9yZHJvcCkpDQogIHJldHVybihyZXN1bHQpDQp9DQoNCk5BX3BlcmNlbnRhZ2VfdGhyZXNob2xkID0gMC44DQoNCnJlc3VsdCA9IHJlbW92ZV92YXJpYWJsZV93aXRoX05BKHRyYWluLCBOQV9wZXJjZW50YWdlX3RocmVzaG9sZCkNCnRyYWluID0gcmVzdWx0JGRhdGENCmRyb3AuY29sdW1uID0gIHJlc3VsdCRkcm9wLmNvbHVtbg0KcHJvcCA9IHByb3AgJT4lIGRwbHlyOjpzZWxlY3QoLW9uZV9vZihkcm9wLmNvbHVtbikpICAgDQpgYGANCg0KDQojUmVtb3ZlIG91dGxpZXJzIG9ic2VydmF0aW9ucw0KYGBge3J9DQojIyMgUmVtb3ZlIG91dGxpZXJzICh0aGlzIGFsc28gbmVlZHMgbW9yZSBpbnZlc3RpZ2F0aW9uKQ0KdHJhaW4gPSB0cmFpbiAlPiUgZmlsdGVyKGxvZ2Vycm9yIDw9IDAuNCAmIGxvZ2Vycm9yID49IC0wLjM5KQ0KYGBgDQoNCiMjTW9kZWwNCg0KYGBge3J9DQoNCm51bWJlcl90cmVlcyA9IDYwMA0KDQpnYm1Nb2RlbCA8LSBnYm0obG9nZXJyb3IgfiAuLCANCiAgICAgICAgICAgICAgICBkaXN0cmlidXRpb249ImdhdXNzaWFuIiwgDQogICAgICAgICAgICAgICAgIyB2YXIubW9ub3RvbmU9YygwLDAsMCwwLDAsMCksICAjIC0xOiBtb25vdG9uZSBkZWNyZWFzZSwNCiAgICAgICAgICAgICAgICAjICsxOiBtb25vdG9uZSBpbmNyZWFzZSwNCiAgICAgICAgICAgICAgICAjICAwOiBubyBtb25vdG9uZSByZXN0cmljdGlvbnMNCiAgICAgICAgICAgICAgICBpbnRlcmFjdGlvbi5kZXB0aCA9IDUsICAgICAgICAgICMjIyAxIG1lYW5zIGFkZGl0aXZl
IG1vZGVsLCAyIGlzIHR3by13YXkgaW50ZXJhY3Rpb24gDQogICAgICAgICAgICAgICAgIyBuLm1pbm9ic2lubm9kZSA9IDEwLCAgICAgICAgICMjIyBtaW5pbXVtIG51bWJlciBvZiBvYnNlcnZhdGlvbnMgaW4gdGhlIHRyZWVzIHRlcm1pbmFsIG5vZGVzLg0KICAgICAgICAgICAgICAgIG4uY29yZXM9ZGV0ZWN0Q29yZXMoKS8yLCAgICAgICMjIyBOdW1iZXIgb2YgY29yZXMgZm9yIHBhcmFsbGFzYXRpb24NCiAgICAgICAgICAgICAgICBiYWcuZnJhY3Rpb24gPSAwLjgsICAgICAgICAgICMjIyBFdmVyeSB0aW1lIG9ubHkgeCUgb2YgdGhlIHNhbXBsZSBhcmUgc2VsZWN0ZWQgdG8gbWFrZSB0aGUgdHJlZQ0KICAgICAgICAgICAgICAgIG4udHJlZXMgPSBudW1iZXJfdHJlZXMsICAgICAgICAgICAgICAjIyMgTnVtYmVyIG9mIHRyZWVzIA0KICAgICAgICAgICAgICAgIHNocmlua2FnZSA9IDAuMDAzMywgICAgICAgICAgICMjIyBUaGVyZSBpcyBhIHNocmlua2FnZSBwYXJhbWV0ZXIgdG8gYXZvaWQgb3Zlci1maXR0aW5nIA0KICAgICAgICAgICAgICAgIGRhdGEgPSB0cmFpbikNCg0KDQpgYGANCg0KI1ByZWRpY3Rpb25zDQpgYGB7cn0NCiNDcmVhdGUgIm1vbnRoIiBwcm94eSBzbyB0aGF0IHRoZSBYZGF0YSBpbiB0aGUgcHJlZGljdGlvbiBtYXRjaGVzIHRoYXQgaW4gdGhlIHRyYWluaW5nIHNldA0KcHJvcCRtb250aCA9IGZhY3RvcigiMTAiLCBsZXZlbHMgPSBsZXZlbHModHJhaW4kbW9udGgpKQ0KDQpzdWJtaXNzaW9uIDwtIHByb3AgJT4lDQogIG11dGF0ZSgiMjAxNjEwIiA9IHByZWRpY3QuZ2JtKG9iamVjdCA9IGdibU1vZGVsLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbmV3ZGF0YSA9IHByb3AsICNwcm9wIG5lZWRzIHRvIGhhdmUgdGhlIHNhbWUgZmVhdHVyZXMgYXMgbXkgdHJhaW4gZGF0YQ0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBuLnRyZWVzID0gbnVtYmVyX3RyZWVzLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHlwZSA9ICJyZXNwb25zZSIpLA0KICAgICAgICAgDQogICAgICAgICAjbWFudWFsbHkgc2V0cyB0aGUgbW9udGggdG8gbm92ZW1iZXIgdG8gYmUgdXNlZCBpbiB0aGUgcHJlZGljdCBYIGRhdGEgcHJvcA0KICAgICAgICAgbW9udGggPSBmYWN0b3IoIjExIiwgbGV2ZWxzID0gbGV2ZWxzKHRyYWluJG1vbnRoKSksDQogICAgICAgICANCiAgICAgICAgICIyMDE2MTEiID0gcHJlZGljdC5nYm0ob2JqZWN0ID0gZ2JtTW9kZWwsIG5ld2RhdGEgPSBwcm9wLCBuLnRyZWVzID0gbnVtYmVyX3RyZWVzLCB0eXBlID0gInJlc3BvbnNlIiksIA0KICAgICAgICAgI21hbnVhbGx5IHNldHMgdGhlIG1vbnRoIHRvIGRlY2VtYmVyDQogICAgICAgICBtb250aD1mYWN0b3IoIjEyIiwgbGV2ZWxzID0gbGV2ZWxzKHRyYWluJG1vbnRoKSksDQogICAgICAgICANCiAgICAgICAgICIyMDE2MTIiPXByZWRpY3QuZ2JtKG9iamVjdD1nYm1Nb2RlbCwgbmV3ZGF0YT1wcm9wLCBuLnRyZWVz
PW51bWJlcl90cmVlcywgdHlwZT0icmVzcG9uc2UiKSwgDQogICAgICAgICAjbW9udGg9ZmFjdG9yKCIxMCIsIGxldmVscyA9IGxldmVscyh0cmFpbiRtb250aCkpLA0KICAgICAgICAgIjIwMTcxMCI9MCwgDQogICAgICAgICAjbW9udGg9ZmFjdG9yKCIxMSIsIGxldmVscyA9IGxldmVscyh0cmFpbiRtb250aCkpLA0KICAgICAgICAgIjIwMTcxMSI9MCwgDQogICAgICAgICAjbW9udGg9ZmFjdG9yKCIxMiIsIGxldmVscyA9IGxldmVscyh0cmFpbiRtb250aCkpLA0KICAgICAgICAgIjIwMTcxMiI9MCkgJT4lDQogIA0KICBzZWxlY3QocGFyY2VsaWQsIGAyMDE2MTBgLCBgMjAxNjExYCwgYDIwMTYxMmAsIGAyMDE3MTBgLCBgMjAxNzExYCwgYDIwMTcxMmApDQoNCm9wdGlvbnMoc2NpcGVuID0gOTk5KSAjIyBETyBub3QgdXNlIHNjaWVudGlmaWMgbm90YXRpb24gDQp3cml0ZS5jc3Yoc3VibWlzc2lvbiwgInN1Ym1pc3Npb25fd2l0aF9uZXdfZmVhdHVyZXMuY3N2Iiwgcm93Lm5hbWVzID0gRkFMU0UpDQoNCmBgYA0KDQo=