Binomial classification example in Scala and GLM with H2O
1 min read · Apr 23, 2017
Here is a sample for a binomial classification problem using the H2O GLM algorithm on the Credit Card data set, written in Scala.
The following sample is for a binomial classification problem. It was created using Spark 2.1.0 with Sparkling Water 2.1.4.
import org.apache.spark.h2o._
import water.support.SparkContextSupport.addFiles
import org.apache.spark.SparkFiles
import java.io.File
import water.support.{H2OFrameSupport, SparkContextSupport, ModelMetricsSupport}
import water.Key
import _root_.hex.glm.GLMModel
import _root_.hex.ModelMetricsBinomial
// Obtain the H2OContext for the shell's SparkContext `sc` and bring its
// members and implicits into scope for the rest of the script.
val hc = H2OContext.getOrCreate(sc)
import hc._
import hc.implicits._

// Register the training CSV with the SparkContext, then load it into an
// H2OFrame from the local path that SparkFiles reports for it.
addFiles(sc, "/Users/avkashchauhan/learn/deepwater/credit_card_clients.csv")
val creditCardData = new H2OFrame(new File(SparkFiles.get("credit_card_clients.csv")))

// Split the frame using a single 0.8 ratio; the two resulting frames are
// keyed "train.hex" and "valid.hex".
val keys = Array[String]("train.hex", "valid.hex")
val ratios = Array[Double](0.8)
val frs = H2OFrameSupport.split(creditCardData, keys, ratios)
val train = frs(0)
val valid = frs(1)
/**
 * Trains a GLM with the binomial family and returns the fitted model.
 *
 * The parameters are configured with the given training and validation
 * frames, the given response column and a single alpha value of 0.5. The
 * model is stored under the key "glmModel.hex", and the call blocks until
 * training has finished.
 *
 * @param train      frame assigned as the training data
 * @param valid      frame assigned as the validation data
 * @param response   name of the response column
 * @param h2oContext implicit H2O context; not referenced in the body
 * @return the trained [[GLMModel]]
 */
def buildGLMModel(train: Frame, valid: Frame, response: String)
                 (implicit h2oContext: H2OContext): GLMModel = {
  import _root_.hex.glm.GLM
  import _root_.hex.glm.GLMModel.GLMParameters
  import _root_.hex.glm.GLMModel.GLMParameters.Family

  val params = new GLMParameters(Family.binomial)
  params._response_column = response
  params._alpha = Array[Double](0.5)
  params._train = train
  params._valid = valid

  val builder = new GLM(params, Key.make("glmModel.hex"))
  // trainModel() starts the job; get() waits for it and yields the model.
  val trained: GLMModel = builder.trainModel().get()
  trained
}
// Train the binomial GLM on the train/valid split. The response column is
// passed as a plain String so the call does not rely on an implicit
// Symbol-to-String conversion (symbol literals are deprecated in Scala 2.13).
val glmModel = buildGLMModel(train, valid, "default_payment_next_month")(hc)

// Collect model metrics and evaluate model quality
val trainMetrics = ModelMetricsSupport.modelMetrics[ModelMetricsBinomial](glmModel, train)
val validMetrics = ModelMetricsSupport.modelMetrics[ModelMetricsBinomial](glmModel, valid)

// Each metric is printed for the training frame first, then the validation frame.
// RMSE
println(trainMetrics.rmse)
println(validMetrics.rmse)
// MSE
println(trainMetrics.mse)
println(validMetrics.mse)
// R2
println(trainMetrics.r2)
println(validMetrics.r2)
// AUC
println(trainMetrics.auc)
println(validMetrics.auc)

// Prediction
// Register the scoring CSV with the SparkContext and load it the same way
// as the training data.
addFiles(sc, "/Users/avkashchauhan/learn/deepwater/credit_card_predict.csv")
val creditPredictData = new H2OFrame(new File(SparkFiles.get("credit_card_predict.csv")))
val predictionFrame = glmModel.score(creditPredictData)

// Pull the predictions back to the driver as an Array[Double]; rows whose
// `result` is empty are represented as Double.NaN.
// NOTE(review): confirm DoubleHolder maps onto the columns of the scored
// frame for a binomial model.
// The binding is immutable (it is never reassigned); the original name,
// including its spelling, is kept so existing references still resolve.
val predictonResults = asRDD[DoubleHolder](predictionFrame).collect.map(_.result.getOrElse(Double.NaN))
That's it, enjoy!