Train and Predict in Exasol

In this tutorial you will learn how to use a UDF to train and predict in R inside your Exasol cluster.

Prerequisites

You must have completed the Train Locally and Predict in Exasol tutorial.

Read data and train model

The first step is to create a UDF script that reads data from the Exasol database into an R data.frame object, trains a basic random forest model, and then uploads the model to BucketFS.

Values in angle brackets such as <bucket_name> are placeholders. Replace these with your own values.

TrainInExasol2 <- exa.createScript(
  exaconn,
  "r_demo.dt_train2",
  function(data) {

    # Load the required packages
    library(randomForest)
    library(RCurl)

    # Load data in chunks of 1000 rows at a time (very useful knob with big data!)
    repeat{
        if (!data$next_row(1000))
            break

      # Transform into data frame
      df <- data.frame(
              id       = data$id,
              crim     = data$crim,
              zn       = data$zn,
              indus    = data$indus,
              chas     = data$chas,
              nox      = data$nox,
              rm       = data$rm,
              age      = data$age,
              dis      = data$dis,
              rad      = data$rad,
              tax      = data$tax,
              ptratio  = data$ptratio,
              b        = data$b,
              lstat    = data$lstat,
              medv     = data$medv,
              split    = data$split
            )
    }

    # Run random forest
    set.seed(42)
    rf_model2 <- randomForest(medv ~ .-id -split,
                              data = df[df$split=="Train",],
                              importance = TRUE)

    curl_opts = curlOptions(userpwd  = "w:<Bucket_write_password>",
                            verbose  = TRUE,
                            httpauth = AUTH_BASIC)
    RCurl:::httpPUT(
      url     = "http://<Exasol_server>:<BucketFS_service_port>/<Bucket_name>/rf_model2",
      content = serialize(rf_model2, ascii = FALSE, connection = NULL),
      curl    = getCurlHandle(.opts = curl_opts)
    )

    # Return the line number
    data$emit(rownames(rf_model2$importance),
              unname(rf_model2$importance[,1]),
              unname(rf_model2$importance[,2]))
  },

  # Input arguments
  inArgs  = c("id         INT",
              "crim       DOUBLE",
              "zn         DOUBLE",
              "indus      DOUBLE",
              "chas       VARCHAR(10)",
              "nox        DOUBLE",
              "rm         DOUBLE",
              "age        DOUBLE",
              "dis        DOUBLE",
              "rad        DOUBLE",
              "tax        DOUBLE",
              "ptratio    DOUBLE",
              "b          DOUBLE",
              "lstat      DOUBLE",
              "medv       DOUBLE",
              "split      VARCHAR(10)"),

  # Output arguments
  outArgs = c("Variable   CHAR(20)",
              "MSE        DOUBLE",
              "NodePurity DOUBLE")
)

# Call of the function. The return is stored in an object
variable_importance = TrainInExasol2("id","crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
                                     "rad", "tax", "ptratio", "b", "lstat", "medv", "split",
                                     table = "r_demo.boston_housing",
                                     groupBy = "iproc(),mod(rownum,5)",
                                     where = "split = 'Train'")

Predict in Exasol

Now that you have the model rf_model2 stored in BucketFS, you can use it to make predictions on the test data in the Exasol database.

# Create UDF for prediction
PredictInExasol2 <- exa.createScript(
  exaconn,
  "r_demo.dt_predict2",
  function(data) {

    # Load the required packages
    require(RCurl)
    require(randomForest)

    # Load data in chunks of 1000 rows at a time
    repeat{
        if (!data$next_row(1000))
            break

      # Load the data into a data.frame
      df <- data.frame(id       = data$id,
                       crim     = data$crim,
                       zn       = data$zn,
                       indus    = data$indus,
                       chas     = data$chas,
                       nox      = data$nox,
                       rm       = data$rm,
                       age      = data$age,
                       dis      = data$dis,
                       rad      = data$rad,
                       tax      = data$tax,
                       ptratio  = data$ptratio,
                       b        = data$b,
                       lstat    = data$lstat,
                       medv     = data$medv,
                       split    = data$split)
    }

    # Load the model from the bucket (note that is unserialized)
    rf_model2 = unserialize(file("/buckets/<BucketFS_service_name>/<Bucket_name>/rf_model2", "rb"))
    
    # Use the loaded model to make the prediction
    prediction <- predict(rf_model2, newdata = df)

    # Return the forecast
    data$emit(df$id, df$medv, prediction)
  },

  # Input arguments
  inArgs  = c("id         INT",
              "crim       DOUBLE",
              "zn         DOUBLE",
              "indus      DOUBLE",
              "chas       VARCHAR(10)",
              "nox        DOUBLE",
              "rm         DOUBLE",
              "age        DOUBLE",
              "dis        DOUBLE",
              "rad        DOUBLE",
              "tax        DOUBLE",
              "ptratio    DOUBLE",
              "b          DOUBLE",
              "lstat      DOUBLE",
              "medv       DOUBLE",
              "split      VARCHAR(10)"),

  # Output arguments
  outArgs = c("id         INT",
              "RealValue  DOUBLE",
              "Prediction DOUBLE")
)

# Create a table with the real values and predicted ones
prediction_output2 = PredictInExasol2("id","crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
                                      "rad", "tax", "ptratio", "b", "lstat", "medv", "split",
                                      table = "r_demo.boston_housing",
                                      groupBy = "iproc(),mod(iproc(),5)",
                                      where = "split = 'Test'")

# Check the root mean squared error (RMSE)
RMSE = sqrt(mean((prediction_output2$PREDICTION - prediction_output2$REALVALUE)^2))
# 3.27, similar to the one above