Train and Predict in Exasol
In this tutorial you will learn how to use a UDF to train and predict in R inside your Exasol cluster.
Prerequisites
You must have completed the Train Locally and Predict in Exasol tutorial.
Read data and train model
The first step is to create a UDF script that reads data from the Exasol database into an R data.frame
object, trains a basic random forest model, and then uploads the model to BucketFS.
Values in angle brackets such as <bucket_name>
are placeholders. Replace these with your own values.
TrainInExasol2 <- exa.createScript(
exaconn,
"r_demo.dt_train2",
function(data) {
# Load the required packages
library(randomForest)
library(RCurl)
# Load data in chunks of 1000 rows at a time (very useful knob with big data!)
repeat{
if (!data$next_row(1000))
break
# Transform into data frame
df <- data.frame(
id = data$id,
crim = data$crim,
zn = data$zn,
indus = data$indus,
chas = data$chas,
nox = data$nox,
rm = data$rm,
age = data$age,
dis = data$dis,
rad = data$rad,
tax = data$tax,
ptratio = data$ptratio,
b = data$b,
lstat = data$lstat,
medv = data$medv,
split = data$split
)
}
# Run random forest
set.seed(42)
rf_model2 <- randomForest(medv ~ .-id -split,
data = df[df$split=="Train",],
importance = TRUE)
curl_opts = curlOptions(userpwd = "w:<Bucket_write_password>",
verbose = TRUE,
httpauth = AUTH_BASIC)
RCurl:::httpPUT(
url = "http://<Exasol_server>:<BucketFS_service_port>/<Bucket_name>/rf_model2",
content = serialize(rf_model2, ascii = FALSE, connection = NULL),
curl = getCurlHandle(.opts = curl_opts)
)
# Return the line number
data$emit(rownames(rf_model2$importance),
unname(rf_model2$importance[,1]),
unname(rf_model2$importance[,2]))
},
# Input arguments
inArgs = c("id INT",
"crim DOUBLE",
"zn DOUBLE",
"indus DOUBLE",
"chas VARCHAR(10)",
"nox DOUBLE",
"rm DOUBLE",
"age DOUBLE",
"dis DOUBLE",
"rad DOUBLE",
"tax DOUBLE",
"ptratio DOUBLE",
"b DOUBLE",
"lstat DOUBLE",
"medv DOUBLE",
"split VARCHAR(10)"),
# Output arguments
outArgs = c("Variable CHAR(20)",
"MSE DOUBLE",
"NodePurity DOUBLE")
)
# Call of the function. The return is stored in an object
variable_importance = TrainInExasol2("id","crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
"rad", "tax", "ptratio", "b", "lstat", "medv", "split",
table = "r_demo.boston_housing",
groupBy = "iproc(),mod(rownum,5)",
where = "split = 'Train'")
Predict in Exasol
Now that you have the model rf_model2
stored in BucketFS, you can use it to make predictions on the test data in the Exasol database.
# Create UDF for prediction
PredictInExasol2 <- exa.createScript(
exaconn,
"r_demo.dt_predict2",
function(data) {
# Load the required packages
require(RCurl)
require(randomForest)
# Load data in chunks of 1000 rows at a time
repeat{
if (!data$next_row(1000))
break
# Load the data into a data.frame
df <- data.frame(id = data$id,
crim = data$crim,
zn = data$zn,
indus = data$indus,
chas = data$chas,
nox = data$nox,
rm = data$rm,
age = data$age,
dis = data$dis,
rad = data$rad,
tax = data$tax,
ptratio = data$ptratio,
b = data$b,
lstat = data$lstat,
medv = data$medv,
split = data$split)
}
# Load the model from the bucket (note that is unserialized)
rf_model2 = unserialize(file("/buckets/<BucketFS_service_name>/<Bucket_name>/rf_model2", "rb"))
# Use the loaded model to make the prediction
prediction <- predict(rf_model2, newdata = df)
# Return the forecast
data$emit(df$id, df$medv, prediction)
},
# Input arguments
inArgs = c("id INT",
"crim DOUBLE",
"zn DOUBLE",
"indus DOUBLE",
"chas VARCHAR(10)",
"nox DOUBLE",
"rm DOUBLE",
"age DOUBLE",
"dis DOUBLE",
"rad DOUBLE",
"tax DOUBLE",
"ptratio DOUBLE",
"b DOUBLE",
"lstat DOUBLE",
"medv DOUBLE",
"split VARCHAR(10)"),
# Output arguments
outArgs = c("id INT",
"RealValue DOUBLE",
"Prediction DOUBLE")
)
# Create a table with the real values and predicted ones
prediction_output2 = PredictInExasol2("id","crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
"rad", "tax", "ptratio", "b", "lstat", "medv", "split",
table = "r_demo.boston_housing",
groupBy = "iproc(),mod(iproc(),5)",
where = "split = 'Test'")
# Check the root mean squared error (RMSE)
RMSE = sqrt(mean((prediction_output2$PREDICTION - prediction_output2$REALVALUE)^2))
# 3.27, similar to the one above