Train and Predict in Exasol
You can now use the UDF to train your model in Exasol. Inside the UDF you read the data from the database into a data.frame
object which is used to run the model. You can then send this model to BucketFS for future usage and return some model information (variable importance in this case). Replace <Bucket_write_password>, <Exasol_server>, <BucketFS_service_port>, <BucketFS_service_name>, and <Bucket_name> with actual values.
TrainInExasol2 <- exa.createScript(
exaconn,
"r_demo.dt_train2",
function(data) {
# Load the required packages
library(randomForest)
library(RCurl)
# Load data in chunks of 1000 rows at a time (very useful knob with big data!)
repeat{
if (!data$next_row(1000))
break
# Transform into data frame
df <- data.frame(
id = data$id,
crim = data$crim,
zn = data$zn,
indus = data$indus,
chas = data$chas,
nox = data$nox,
rm = data$rm,
age = data$age,
dis = data$dis,
rad = data$rad,
tax = data$tax,
ptratio = data$ptratio,
b = data$b,
lstat = data$lstat,
medv = data$medv,
split = data$split
)
}
# Run random forest
set.seed(42)
rf_model2 <- randomForest(medv ~ .-id -split,
data = df[df$split=="Train",],
importance = TRUE)
curl_opts = curlOptions(userpwd = "w:<Bucket_write_password>",
verbose = TRUE,
httpauth = AUTH_BASIC)
RCurl:::httpPUT(
url = "http://<Exasol_server>:<BucketFS_service_port>/<Bucket_name>/rf_model2",
content = serialize(rf_model2, ascii = FALSE, connection = NULL),
curl = getCurlHandle(.opts = curl_opts)
)
# Return the line number
data$emit(rownames(rf_model2$importance),
unname(rf_model2$importance[,1]),
unname(rf_model2$importance[,2]))
},
# Input arguments
inArgs = c("id INT",
"crim DOUBLE",
"zn DOUBLE",
"indus DOUBLE",
"chas VARCHAR(10)",
"nox DOUBLE",
"rm DOUBLE",
"age DOUBLE",
"dis DOUBLE",
"rad DOUBLE",
"tax DOUBLE",
"ptratio DOUBLE",
"b DOUBLE",
"lstat DOUBLE",
"medv DOUBLE",
"split VARCHAR(10)"),
# Output arguments
outArgs = c("Variable CHAR(20)",
"MSE DOUBLE",
"NodePurity DOUBLE")
)
# Call of the function. The return is stored in an object
variable_importance = TrainInExasol2("id","crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
"rad", "tax", "ptratio", "b", "lstat", "medv", "split",
table = "r_demo.boston_housing",
groupBy = "iproc(),mod(rownum,5)",
where = "split = 'Train'")
Now that you have model, rf_model2
, stored in BucketFS, you can use it to make predictions on the test data which is in Exasol database. The testing procedure is the same as in the scenario above.
# Create UDF for prediction
PredictInExasol2 <- exa.createScript(
exaconn,
"r_demo.dt_predict2",
function(data) {
# Load the required packages
require(RCurl)
require(randomForest)
# Load data in chunks of 1000 rows at a time
repeat{
if (!data$next_row(1000))
break
# Load the data into a data.frame
df <- data.frame(id = data$id,
crim = data$crim,
zn = data$zn,
indus = data$indus,
chas = data$chas,
nox = data$nox,
rm = data$rm,
age = data$age,
dis = data$dis,
rad = data$rad,
tax = data$tax,
ptratio = data$ptratio,
b = data$b,
lstat = data$lstat,
medv = data$medv,
split = data$split)
}
# Load the model from the bucket (note that is unserialized)
rf_model2 = unserialize(file("/buckets/<BucketFS_service_name>/<Bucket_name>/rf_model2", "rb"))
# Use the loaded model to make the prediction
prediction <- predict(rf_model2, newdata = df)
# Return the forecast
data$emit(df$id, df$medv, prediction)
},
# Input arguments
inArgs = c("id INT",
"crim DOUBLE",
"zn DOUBLE",
"indus DOUBLE",
"chas VARCHAR(10)",
"nox DOUBLE",
"rm DOUBLE",
"age DOUBLE",
"dis DOUBLE",
"rad DOUBLE",
"tax DOUBLE",
"ptratio DOUBLE",
"b DOUBLE",
"lstat DOUBLE",
"medv DOUBLE",
"split VARCHAR(10)"),
# Output arguments
outArgs = c("id INT",
"RealValue DOUBLE",
"Prediction DOUBLE")
)
# Create a table with the real values and predicted ones
prediction_output2 = PredictInExasol2("id","crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
"rad", "tax", "ptratio", "b", "lstat", "medv", "split",
table = "r_demo.boston_housing",
groupBy = "iproc(),mod(iproc(),5)",
where = "split = 'Test'")
# Check the root mean squared error (RMSE)
RMSE = sqrt(mean((prediction_output2$PREDICTION - prediction_output2$REALVALUE)^2))
# 3.27, similar to the one above