project:hotornot:r [make.opendata.ch wiki]

This is an old revision of the document!

R Analysis

The sample data was analysed in RStudio version 0.99.489, using the randomForest package (version 4.6-12) to determine which variables were most important.

The source code is shown below. An explanation follows.

# Read the two CSV inputs. Both are comma-separated UTF-8 files with a header
# row, double-quoted strings and "." as the decimal mark.
# randomForest does not allow NA values in its input.
# NOTE(review): the original note ended in the unfinished fragment
#   na.strings = c("9999", "notset"
# which looks like an option that was once passed to read.csv; it is not
# active in the calls below - confirm whether it should be.

# Sample data: string columns are converted to factors.
raw = read.csv(
  file = "RawData.csv",
  header = TRUE,
  sep = ",",
  quote = "\"",
  dec = ".",
  fileEncoding = "UTF-8",
  encoding = "UTF-8",
  stringsAsFactors = TRUE
)

# Metadata: string columns are kept as plain character vectors.
meta = read.csv(
  file = "MetaData.csv",
  header = TRUE,
  sep = ",",
  quote = "\"",
  dec = ".",
  fileEncoding = "UTF-8",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# eliminate rows where data is not available
# NOTE(review): this runs before the column drops below, so a row is also
# discarded when its only NA sits in a column that is removed afterwards -
# confirm that this ordering is intended.
raw = raw[complete.cases(raw), ]

# eliminate rows where water heater is solar (disabled)
# NOTE(review): this filter reads WaterHeatingType, so if it is re-enabled it
# must run before that column is dropped below.
# raw = raw[as.character(raw$WaterHeatingType) != "solar",]

# Columns to eliminate, in the order they were originally removed:
#   - the type of water heating
#   - every column whose importance (MeanDecreaseAccuracy) is small
# Commented-out entries are columns that are deliberately kept.
drop_columns = c(
  "WaterHeatingType",
  "QSSmart",
  "HasSmartMeter",
  "QSLamps",
  "QSGender",
  "SentInvitesAccepted",
  "InvitesAccepted",
  "QSAge",
  "NumMeters",
  "SavingTipStatusCount_1",
  "SavingTipStatusCount_3",
  "InvitesSent",
  "QSEcoEnergy",
  "QuizAnswered",
  "DaysLoggedIn",
  "LotteriesParticipated",
  "NumAppliancesEntered",
  "HouseholdType",
  "SavingTipStatusCount_2",
  # "HouseholdMembers",
  # "HadAudit",
  "MobilePhoneEmpty",
  "NumDevices",
  # "QSKnowMeter",
  "DaysReadingEntered",
  "ReadingCount",
  "WeeksReadingEntered",
  "QSInterest",
  "QuizAnsweredCorrectly",
  "NumWeeksMember",
  "NumDaysMember",
  "NumVisit",
  "WeeksLoggedIn",
  "Points"
  # "LivingArea",
  # "MainHeatingType"
)

# Fail early when an expected column is absent, as the one-call-per-column
# subset() version did, but report every missing name at once.
missing_columns = setdiff(drop_columns, names(raw))
if (length(missing_columns) > 0) {
  stop(
    "columns to eliminate not found in raw: ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

# Drop all listed columns in one step; drop = FALSE keeps raw a data.frame
# even if only one column remains.
raw = raw[, !(names(raw) %in% drop_columns), drop = FALSE]

# Response variable: TRUE where QSSolar equals 28, FALSE otherwise, stored as
# a factor so that the analysis is run as a classification.
# NOTE(review): 28 is presumably the answer code meaning "has solar" - confirm
# against the metadata.
solar = factor(raw[, "QSSolar"] == 28)

# Predictors: every remaining column except the response itself.
data = raw[, names(raw) != "QSSolar", drop = FALSE]

# load the analysis package
library(randomForest)

# Random forests are built from random samples, so fix the seed to make the
# forest - and the importance ranking derived from it - the same on every run.
# The value itself is arbitrary.
set.seed(42)

# run the analysis
#   importance = TRUE  assess the importance of each predictor
#   proximity  = TRUE  compute the proximity matrix that MDSplot needs
#   do.trace   = TRUE  print progress while the forest is grown
forest = randomForest(
  x = data,
  y = solar,
  importance = TRUE,
  proximity = TRUE,
  do.trace = TRUE
)

# show some results
# error rates against the number of trees, with a logarithmic y axis
plot(forest, log = "y")
# multi-dimensional scaling plot of the proximity matrix, coloured by class
MDSplot(forest, solar)

# variable importance measures, one row per predictor
imp = importance(forest)

Steps

The data set was loaded into R using the built-in CSV reader.

Since randomForest does not operate when there are NA values, only complete cases were retained.