Train and Predict in Exasol

This section explains how to use the UDF to train your model and predict in Exasol.

Inside the UDF you read the data from the database into a data.frame object which is used to run the model. You can then send this model to BucketFS for future usage and return some model information (variable importance in this case).

Replace <Bucket_write_password>, <Exasol_server>, <BucketFS_service_port>, <BucketFS_service_name>, and <Bucket_name> with actual values.

TrainInExasol2 <- exa.createScript(
  exaconn,
  "r_demo.dt_train2",
  function(data) {

    # Load the required packages
    library(randomForest)
    library(RCurl)

    # Load data in chunks of 1000 rows at a time (very useful knob with big data!)
    repeat{
        if (!data$next_row(1000))
            break

      # Transform into data frame
      df <- data.frame(
              id       = data$id,
              crim     = data$crim,
              zn       = data$zn,
              indus    = data$indus,
              chas     = data$chas,
              nox      = data$nox,
              rm       = data$rm,
              age      = data$age,
              dis      = data$dis,
              rad      = data$rad,
              tax      = data$tax,
              ptratio  = data$ptratio,
              b        = data$b,
              lstat    = data$lstat,
              medv     = data$medv,
              split    = data$split
            )
    }

    # Run random forest
    set.seed(42)
    rf_model2 <- randomForest(medv ~ .-id -split,
                              data = df[df$split=="Train",],
                              importance = TRUE)

    curl_opts = curlOptions(userpwd  = "w:<Bucket_write_password>",
                            verbose  = TRUE,
                            httpauth = AUTH_BASIC)
    RCurl:::httpPUT(
      url     = "http://<Exasol_server>:<BucketFS_service_port>/<Bucket_name>/rf_model2",
      content = serialize(rf_model2, ascii = FALSE, connection = NULL),
      curl    = getCurlHandle(.opts = curl_opts)
    )

    # Return the line number
    data$emit(rownames(rf_model2$importance),
              unname(rf_model2$importance[,1]),
              unname(rf_model2$importance[,2]))
  },

  # Input arguments
  inArgs  = c("id         INT",
              "crim       DOUBLE",
              "zn         DOUBLE",
              "indus      DOUBLE",
              "chas       VARCHAR(10)",
              "nox        DOUBLE",
              "rm         DOUBLE",
              "age        DOUBLE",
              "dis        DOUBLE",
              "rad        DOUBLE",
              "tax        DOUBLE",
              "ptratio    DOUBLE",
              "b          DOUBLE",
              "lstat      DOUBLE",
              "medv       DOUBLE",
              "split      VARCHAR(10)"),

  # Output arguments
  outArgs = c("Variable   CHAR(20)",
              "MSE        DOUBLE",
              "NodePurity DOUBLE")
)

# Call of the function. The return is stored in an object
variable_importance = TrainInExasol2("id","crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
                                     "rad", "tax", "ptratio", "b", "lstat", "medv", "split",
                                     table = "r_demo.boston_housing",
                                     groupBy = "iproc(),mod(rownum,5)",
                                     where = "split = 'Train'")

Now that you have the model rf_model2 stored in BucketFS, you can use it to make predictions on the test data that is in the Exasol database. The testing procedure is the same as in the scenario above.

# Create UDF for prediction
PredictInExasol2 <- exa.createScript(
  exaconn,
  "r_demo.dt_predict2",
  function(data) {

    # Load the required packages
    require(RCurl)
    require(randomForest)

    # Load data in chunks of 1000 rows at a time
    repeat{
        if (!data$next_row(1000))
            break

      # Load the data into a data.frame
      df <- data.frame(id       = data$id,
                       crim     = data$crim,
                       zn       = data$zn,
                       indus    = data$indus,
                       chas     = data$chas,
                       nox      = data$nox,
                       rm       = data$rm,
                       age      = data$age,
                       dis      = data$dis,
                       rad      = data$rad,
                       tax      = data$tax,
                       ptratio  = data$ptratio,
                       b        = data$b,
                       lstat    = data$lstat,
                       medv     = data$medv,
                       split    = data$split)
    }

    # Load the model from the bucket (note that is unserialized)
    rf_model2 = unserialize(file("/buckets/<BucketFS_service_name>/<Bucket_name>/rf_model2", "rb"))
    
    # Use the loaded model to make the prediction
    prediction <- predict(rf_model2, newdata = df)

    # Return the forecast
    data$emit(df$id, df$medv, prediction)
  },

  # Input arguments
  inArgs  = c("id         INT",
              "crim       DOUBLE",
              "zn         DOUBLE",
              "indus      DOUBLE",
              "chas       VARCHAR(10)",
              "nox        DOUBLE",
              "rm         DOUBLE",
              "age        DOUBLE",
              "dis        DOUBLE",
              "rad        DOUBLE",
              "tax        DOUBLE",
              "ptratio    DOUBLE",
              "b          DOUBLE",
              "lstat      DOUBLE",
              "medv       DOUBLE",
              "split      VARCHAR(10)"),

  # Output arguments
  outArgs = c("id         INT",
              "RealValue  DOUBLE",
              "Prediction DOUBLE")
)

# Create a table with the real values and predicted ones
prediction_output2 = PredictInExasol2("id","crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
                                      "rad", "tax", "ptratio", "b", "lstat", "medv", "split",
                                      table = "r_demo.boston_housing",
                                      groupBy = "iproc(),mod(iproc(),5)",
                                      where = "split = 'Test'")

# Check the root mean squared error (RMSE)
RMSE = sqrt(mean((prediction_output2$PREDICTION - prediction_output2$REALVALUE)^2))
# 3.27, similar to the one above