<isol>

@.li:Listing 1: Beispieldaten mit Pilzbestandteilen

@li:+---------+---------+---------+-------+---------+----------+-----------+-----------+
|    class|cap-shape|cap-color|bruises|veil-type|gill-color|ring-number|spore-color|
+---------+---------+---------+-------+---------+----------+-----------+-----------+
|   edible|   convex|   yellow|bruises|  partial|     white|          1|      black|
|   edible|     flat|    brown|   null|  partial|      gray|          1|      black|
|poisonous|   convex|    brown|bruises|  partial|      pink|          1|      brown|
|poisonous|   convex|   yellow|   null|  partial|      pink|          1|       null|
|poisonous|     flat|     gray|bruises|  partial| chocolate|          1|  chocolate|
|   edible|  knobbed|      red|bruises|  partial|     white|          2|      white|
|poisonous|     flat|      red|   null|  partial|     white|          0|      white|
+---------+---------+---------+-------+---------+----------+-----------+-----------+





@.li:Listing 2: <I>ReadDescribe<I>

@li:// Read the CSV file; the first line is the header, column types are inferred
var df = spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("<path>/mushrooms_ix.csv")

df.cache()

// Summary statistics for exactly the columns shown in the output table
df.describe("class", "cap-shape", "bruises", "veil-type",
            "ring-number", "spore-color").show()

+-------+---------+---------+-------+---------+------------------+-----------+
|summary|    class|cap-shape|bruises|veil-type|       ring-number|spore-color|
+-------+---------+---------+-------+---------+------------------+-----------+
|  count|     8416|     8416|   3376|     8416|              8416|       7989|
|   mean|     null|     null|   null|     null|1.0655893536121672|       null|
| stddev|     null|     null|   null|     null|0.2696347029978422|       null|
|    min|   edible|     bell|bruises|  partial|                 0|      black|
|    max|poisonous|   sunken|bruises|  partial|                 2|     yellow|
+-------+---------+---------+-------+---------+------------------+-----------+





@.li:Listing 3: <I>Mushroom DS CntDist<I>

@li:// Number of distinct values per column, each aliased with its column name
df.agg(
  countDistinct("class").alias("class"),
  // ... (the columns in between are elided here)
  countDistinct("spore-color").alias("spore-color")
).show()

+-----+---------+---------+-------+----------+---------+-----------+-----------+
|class|cap-shape|cap-color|bruises|gill-color|veil-type|ring-number|spore-color|
+-----+---------+---------+-------+----------+---------+-----------+-----------+
|    2|        6|       10|      1|        12|        1|          3|          9|
+-----+---------+---------+-------+----------+---------+-----------+-----------+





@.li:Listing 4: Pipeline-Aufbau

@li:// The "class" column is the label, all other columns are features
var label = "class"
var features = df.columns.filter(_ != label)

// Indexer for the label
var labelIndexer = new StringIndexer()
                         .setInputCol(label)
                         .setOutputCol("i_" + label)

// One indexer per feature column (handleInvalid is set to "skip")
var featureIndexers = features.map(f =>
    new StringIndexer()
        .setInputCol(f)
        .setOutputCol("i_" + f)
        .setHandleInvalid("skip"))


// Build the feature vector from the indexed feature columns
var featureColumns = featureIndexers.map(_.getOutputCol)
var assembler = new VectorAssembler()
                      .setInputCols(featureColumns)
                      .setOutputCol("features")

// Automatic detection of categorical features (maxCategories = 12)
var catVectorIndexer = new VectorIndexer()
                             .setInputCol(assembler.getOutputCol)
                             .setOutputCol("catFeatures")
                             .setMaxCategories(12)

// Create the classifier; it reads the indexed label and the categorical
// feature vector and writes its result to "predictedIndex"
var rfClassifier = new RandomForestClassifier()
                         .setLabelCol(labelIndexer.getOutputCol)
                         .setFeaturesCol(catVectorIndexer.getOutputCol)
                         .setPredictionCol("predictedIndex")

// Determine the label strings needed for the back-conversion
var labels = labelIndexer.fit(df).labels

// Convert the indexed prediction values back to plain text
var labelConverter = new IndexToString()
                           .setInputCol(rfClassifier.getPredictionCol)
                           .setOutputCol("predictedLabel")
                           .setLabels(labels)





@.li:Listing 5: Pipeline

@li:// Build the pipeline: label indexer, feature indexers, assembler,
// vector indexer, classifier and label converter, in that order
var pipeline = {
    val indexerStages = labelIndexer +: featureIndexers
    new Pipeline().setStages(
        indexerStages :+ assembler :+ catVectorIndexer :+ rfClassifier :+ labelConverter)
}

// Split into training and test data with weights 0.7 and 0.3
var Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3))

// Create / train the model on the training data
var model = pipeline.fit(trainingData)

// Apply the model to the test data
var predictions = model.transform(testData)

// Evaluation: compare the indexed label with the predicted index
var evaluator = new MulticlassClassificationEvaluator()
    .setMetricName("accuracy")
    .setLabelCol(labelIndexer.getOutputCol)
    .setPredictionCol(rfClassifier.getPredictionCol)

var accuracy = evaluator.evaluate(predictions)

println(f"Accuracy is $accuracy%.3f")

Accuracy is 0.949






@.li:Listing 6: Predictions

@li:// Show the true class, the predicted label, the feature vector
// and three of the original feature columns
predictions.select(
    "class", "predictedLabel", "catfeatures",
    "cap-shape", "cap-color", "bruises"
).show()

+---------+--------------+--------------------+---------+---------+-------+
|    class|predictedLabel|         catfeatures|cap-shape|cap-color|bruises|
+---------+--------------+--------------------+---------+---------+-------+
|   edible|        edible|[0.0,4.0,1.0,2.0,...|   convex|    white|bruises|
|   edible|        edible|[0.0,3.0,1.0,3.0,...|   convex|   yellow|bruises|
|   edible|        edible|[1.0,0.0,1.0,6.0,...|     flat|    brown|bruises|
|poisonous|     poisonous|       (6,[1],[2.0])|   convex|      red|     no|
|poisonous|        edible|[1.0,0.0,1.0,2.0,...|     flat|    brown|bruises|
|poisonous|        edible|[1.0,4.0,1.0,2.0,...|     flat|    white|bruises|
|poisonous|     poisonous| (6,[0,1],[2.0,2.0])|  knobbed|      red|     no|
+---------+--------------+--------------------+---------+---------+-------+





@.li:Listing 7: Crossvalidation

@li:// Hyperparameter grid: every combination of numTrees and maxDepth
var paramGrid = new ParamGridBuilder()
    .addGrid(rfClassifier.numTrees, Array(5, 20))
    .addGrid(rfClassifier.maxDepth, Array(5, 10))
    .build()

// Create the CrossValidator around the pipeline and configure it
// with the evaluator, the parameter grid and 3 folds
var cv = new CrossValidator()
    .setNumFolds(3)
    .setEstimatorParamMaps(paramGrid)
    .setEvaluator(evaluator)
    .setEstimator(pipeline)

// Fit on the training data, then apply the result to the test data
var cvModel = cv.fit(trainingData)
var cvpredictions = cvModel.transform(testData)
