-- ---------------------------------
-- Beispielcode zum iX Artikel
-- ---------------------------------

-- ---------------------------------
-- Create Extension
-- ---------------------------------

-- Install the PL/Python 3 language that every ml.* function in this script is written in.
-- IF NOT EXISTS keeps the script re-runnable: without it the statement raises an
-- error as soon as the extension is already installed in the database.
CREATE EXTENSION IF NOT EXISTS plpython3u;

-- ----------------------------------
-- PL/Python UDF ml.predict
-- ----------------------------------

--drop function if exists ml.predict(m bytea, fv double precision[]);

-- Apply a pickled model to one feature vector and return its prediction.
--   m  : pickled object exposing a predict() method
--   fv : feature vector of a single sample
-- Returns the first (and only) element of the prediction result;
-- NULL if either argument is NULL.
CREATE OR REPLACE FUNCTION ml.predict(m bytea, fv double precision[])
RETURNS double precision
AS $$
import pickle
import numpy as np

# NOTE(review): pickle.loads can execute arbitrary code while unpickling;
# only pass model bytes that come from a trusted source.
model = pickle.loads(m)

# predict() is called with a 2-D input: one row, as many columns as fv has entries.
sample = np.array(fv).reshape(1, -1)
predictions = model.predict(sample)
return predictions[0]
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

-- ----------------------------------
-- Daten Analyse
-- ----------------------------------

-- Ad-hoc look at the raw data: every column of every row in ml.mushroom.
SELECT m.*
FROM ml.mushroom AS m;

/*
SELECT
 ',COUNT(DISTINCT ' || column_name || ') AS cnt_dist_' || column_name AS cnt_dist
,',COUNT(' || column_name || ') AS cnt_' || column_name AS cnt
FROM information_schema.columns
 WHERE table_schema = 'ml'
   AND table_name   = 'mushroom'
ORDER BY ordinal_position ;
*/


-- Profile three attributes of ml.mushroom: total row count, plus the number of
-- non-NULL values and the number of distinct values per attribute.
SELECT
	 COUNT(*)                      AS cnt
	-- bruises
	,COUNT(m.bruises)              AS cnt_bruises
	,COUNT(DISTINCT m.bruises)     AS cnt_dist_bruises
	-- veil_type
	,COUNT(m.veil_type)            AS cnt_veil_type
	,COUNT(DISTINCT m.veil_type)   AS cnt_dist_veil_type
	-- spore_color
	,COUNT(m.spore_color)          AS cnt_spore_color
	,COUNT(DISTINCT m.spore_color) AS cnt_dist_spore_color
FROM ml.mushroom AS m;

-- ----------------------------------
-- Daten Aufbereitung
-- ----------------------------------

--DROP VIEW IF EXISTS ml.mushroom_prep;

-- Prepared data set: one row per mushroom whose spore_color is known.
--   pct_rnk       : percent rank of the row within a random ordering; the
--                   queries further down split the data on it (<= 0.80 / > 0.80)
--   class         : label column, passed through unchanged
--   featurevector : array built from six attributes of the mushroom
CREATE OR REPLACE VIEW ml.mushroom_prep
AS
WITH seed
AS
(
	-- Seeds the random number generator used by random() below.
	-- NOTE(review): the shuffle is only repeatable if this CTE is evaluated
	-- before the first random() call and the table is scanned in the same
	-- order -- confirm on the target PostgreSQL version.
	SELECT setseed(1)
)
SELECT
	 percent_rank() OVER(ORDER BY random() ASC) AS pct_rnk
	,m.class
	,ARRAY[
		 m.cap_shape
		,m.cap_color
		,COALESCE(m.bruises, 'no')	-- a missing bruises value is treated as 'no'
		,m.gill_color
		,m.ring_number
		,m.spore_color
	] AS featurevector
FROM ml.mushroom m
CROSS JOIN seed d	-- joined only so that the seed CTE is actually executed
WHERE m.spore_color IS NOT NULL;

-- Ad-hoc look at the prepared data (pct_rnk, class, featurevector).
SELECT p.*
FROM ml.mushroom_prep AS p;

-- ----------------------------------
-- Indexierung von Label und
-- Featurevektor
-- ----------------------------------

-- ----------------------------------
-- sklearn LabelEncoder UDF
-- ----------------------------------

--DROP FUNCTION IF EXISTS ml.labelencoder(l varchar[]);

-- Fit a scikit-learn LabelEncoder on the given labels.
--   l : array of label values to fit on
-- Returns the fitted encoder serialized with pickle; NULL if l is NULL.
CREATE OR REPLACE FUNCTION ml.labelencoder(l varchar[])
RETURNS bytea
AS $$
import pickle
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(l)
serialized = pickle.dumps(encoder)
return serialized
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

-- ----------------------------------
-- sklearn OneHotEncoder UDF
-- ----------------------------------

--DROP FUNCTION IF EXISTS ml.onehotencoder(fv varchar[]);

-- Fit a scikit-learn OneHotEncoder (dense output) on the given feature vectors.
--   fv : array of feature vectors to fit on
-- Returns the fitted encoder serialized with pickle; NULL if fv is NULL.
CREATE OR REPLACE FUNCTION ml.onehotencoder(fv varchar[])
RETURNS bytea
AS $$
import pickle
from sklearn.preprocessing import OneHotEncoder

# The keyword for dense output was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4. Try the current
# name first and fall back to the old one, so the function works on both
# sides of the rename.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

encoder.fit(fv)
return pickle.dumps(encoder)
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

-- ----------------------------------
-- sklearn Encode UDF
-- overloaded
-- ----------------------------------

--DROP FUNCTION IF EXISTS ml.encode(e bytea, v varchar) CASCADE;

-- Encode a single value with a pickled encoder (scalar overload).
--   e : pickled object exposing a transform() method
--   v : the value to encode
-- Returns the first element of the transform() result; NULL on NULL input.
CREATE OR REPLACE FUNCTION ml.encode(e bytea, v varchar)
RETURNS double precision
AS $$
import pickle

encoder = pickle.loads(e)
# transform() is given a one-element list, so the result holds one entry.
encoded = encoder.transform([v])
return encoded[0]
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

--DROP FUNCTION IF EXISTS ml.encode(e bytea, v varchar[]) CASCADE;

-- Encode one feature vector with a pickled encoder (array overload).
--   e : pickled object exposing a transform() method
--   v : the feature vector to encode
-- Returns the first row of the transform() result; NULL on NULL input.
CREATE OR REPLACE FUNCTION ml.encode(e bytea, v varchar[])
RETURNS double precision[]
AS $$
import pickle

encoder = pickle.loads(e)
# The vector is wrapped in a list so transform() sees a single sample.
encoded_rows = encoder.transform([v])
return encoded_rows[0]
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

-- ----------------------------------
-- SQL zur Indexierung von Label
-- und Featurevektor
-- ----------------------------------

-- Fit both encoders on the prepared data and show every row next to its
-- encoded label (i_class) and encoded feature vector (i_featurevector).
WITH prepared AS
(
	SELECT
		 p.pct_rnk
		,p.class
		,p.featurevector
	FROM ml.mushroom_prep AS p
),
encoders AS
(
	-- Exactly one row holding the two fitted, pickled encoders.
	SELECT
		 ml.labelencoder(array_agg(DISTINCT src.class)) AS le
		,ml.onehotencoder(array_agg(DISTINCT src.featurevector)) AS fve
	FROM prepared AS src
)
SELECT
	 d.pct_rnk
	,d.class
	,ml.encode(e.le, d.class) AS i_class
	,d.featurevector
	,ml.encode(e.fve, d.featurevector) AS i_featurevector
FROM prepared AS d
CROSS JOIN encoders AS e;

-- ----------------------------------
-- Decision Tree und Logistic
-- Regression Klassifikatoren
-- ----------------------------------

-- ----------------------------------
-- sklearn Decision Tree UDF
-- ----------------------------------

--DROP FUNCTION IF EXISTS ml.dt_model(x double precision[][], y double precision[]);

-- Train a scikit-learn DecisionTreeClassifier with default settings.
--   x : training samples, passed to fit() as the first argument
--   y : target values, passed to fit() as the second argument
-- Returns the fitted classifier serialized with pickle; NULL on NULL input.
CREATE OR REPLACE FUNCTION ml.dt_model(x double precision[][], y double precision[])
RETURNS bytea
AS $$
import pickle
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()
classifier.fit(x, y)
return pickle.dumps(classifier)
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

-- ----------------------------------
-- sklearn Logistic Regression UDF
-- ----------------------------------

--DROP FUNCTION IF EXISTS ml.lr_model(x double precision[][], y double precision[]);

-- Train a scikit-learn LogisticRegression classifier (one-vs-rest).
--   x : training samples, passed to fit() as the first argument
--   y : target values, passed to fit() as the second argument
-- Returns the fitted classifier serialized with pickle; NULL on NULL input.
CREATE OR REPLACE FUNCTION ml.lr_model(x double precision[][], y double precision[])
RETURNS bytea
AS $$
import pickle
from sklearn import linear_model

# NOTE(review): the multi_class parameter appears to be deprecated in recent
# scikit-learn releases -- confirm against the installed version.
classifier = linear_model.LogisticRegression(multi_class='ovr')
classifier.fit(x, y)
return pickle.dumps(classifier)
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

-- ----------------------------------
-- Encoder Persistenz
-- ----------------------------------

-- Fit the label encoder and the one-hot encoder on the prepared data and
-- store both in ml.model as 'Preprocessing' models of the mushroom task.
WITH fitted_encoders AS
(
	SELECT
		 'LabelEncoder' AS model_name
		,'sklearn.preprocessing.LabelEncoder' AS algorithm
		,ml.labelencoder(array_agg(DISTINCT p.class)) AS model
	FROM ml.mushroom_prep AS p

	UNION ALL

	SELECT
		 'OneHotEncoder' AS model_name
		,'sklearn.preprocessing.OneHotEncoder' AS algorithm
		,ml.onehotencoder(array_agg(DISTINCT p.featurevector)) AS model
	FROM ml.mushroom_prep AS p
)
INSERT INTO ml.model (task, model_name, model_type, algorithm, model)
SELECT
	 'Mushroom Classification' AS task
	,fe.model_name
	,'Preprocessing' AS model_type
	,fe.algorithm
	,fe.model
FROM fitted_encoders AS fe;

-- ----------------------------------
-- Klassifikator Modell Persistenz
-- ----------------------------------

-- Train both classifiers on the training split (pct_rnk <= 0.80) and store
-- them in ml.model under the mushroom task.
WITH train_rows AS
(
	SELECT
		 p.class
		,p.featurevector
	FROM ml.mushroom_prep AS p
	WHERE p.pct_rnk <= 0.80
),
encoders AS
(
	-- The two encoders stored earlier, fetched through ml.get_model.
	SELECT
		 ml.get_model('Mushroom Classification', 'LabelEncoder') AS le
		,ml.get_model('Mushroom Classification', 'OneHotEncoder') AS fve
),
encoded AS
(
	-- Collapse the training split into one array of encoded labels and one
	-- array of encoded feature vectors.
	SELECT
		 array_agg(ml.encode(en.le, t.class)) AS i_classes
		,array_agg(ml.encode(en.fve, t.featurevector)) AS i_fvects
	FROM train_rows AS t
	CROSS JOIN encoders AS en
),
classifiers AS
(
	SELECT
		 'DecisionTreeClassifier' AS model_name
		,'Tree' AS model_type
		,'sklearn.tree.DecisionTreeClassifier' AS algorithm
		,ml.dt_model(ec.i_fvects, ec.i_classes) AS model
	FROM encoded AS ec

	UNION ALL

	SELECT
		 'LogisticRegression' AS model_name
		,'Linear_Model' AS model_type
		,'sklearn.linear_model.LogisticRegression' AS algorithm
		,ml.lr_model(ec.i_fvects, ec.i_classes) AS model
	FROM encoded AS ec
)
INSERT INTO ml.model (task, model_name, model_type, algorithm, model)
SELECT
	 'Mushroom Classification' AS task
	,c.model_name
	,c.model_type
	,c.algorithm
	,c.model
FROM classifiers AS c;

-- Show every row stored in ml.model.
SELECT mdl.*
FROM ml.model AS mdl;

-- Fetch the stored decision tree through ml.get_model
-- (that function is not defined in this part of the script).
SELECT ml.get_model('Mushroom Classification', 'DecisionTreeClassifier');

-- ----------------------------------
-- Modell Evaluation
-- ----------------------------------

-- ----------------------------------
-- sklearn Score UDF
-- ----------------------------------

--DROP FUNCTION IF EXISTS ml.score(m bytea, x double precision[][], y double precision[]);

-- Evaluate a pickled model by calling its score() method.
--   m : pickled object exposing a score() method
--   x : samples, passed to score() as the first argument
--   y : expected target values, passed to score() as the second argument
-- Returns whatever score(x, y) returns; NULL on NULL input.
CREATE OR REPLACE FUNCTION ml.score(m bytea, x double precision[][], y double precision[])
RETURNS double precision
AS $$
import pickle

model = pickle.loads(m)
result = model.score(x, y)
return result
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

-- ----------------------------------
-- SQL zur Bewertung der
-- Klassifikatoren
-- ----------------------------------

-- Score both stored classifiers on the test split (pct_rnk > 0.80).
-- One result row per classifier.
-- NOTE(review): the output column is called "precision", but scikit-learn
-- classifiers normally report mean accuracy from score() -- confirm the
-- intended metric name.
WITH test_rows AS
(
	SELECT
		 p.class
		,p.featurevector
	FROM ml.mushroom_prep AS p
	WHERE p.pct_rnk > 0.80
),
models AS
(
	-- Encoders and classifiers stored earlier, fetched through ml.get_model.
	SELECT
		 ml.get_model('Mushroom Classification', 'LabelEncoder') AS le
		,ml.get_model('Mushroom Classification', 'OneHotEncoder') AS fve
		,ml.get_model('Mushroom Classification', 'DecisionTreeClassifier') AS dtc
		,ml.get_model('Mushroom Classification', 'LogisticRegression') AS lrc
)
SELECT
	 'DecisionTreeClassifier' AS model_name
	,ml.score(
		 mo.dtc
		,array_agg(ml.encode(mo.fve, t.featurevector))
		,array_agg(ml.encode(mo.le, t.class))
	) AS precision
FROM test_rows AS t
CROSS JOIN models AS mo
GROUP BY mo.dtc	-- grouping by the model value lets it appear outside the aggregates

UNION ALL

SELECT
	 'LogisticRegression' AS model_name
	,ml.score(
		 mo.lrc
		,array_agg(ml.encode(mo.fve, t.featurevector))
		,array_agg(ml.encode(mo.le, t.class))
	) AS precision
FROM test_rows AS t
CROSS JOIN models AS mo
GROUP BY mo.lrc;

-- ----------------------------------
-- Modell Anwendung
-- ----------------------------------

-- ----------------------------------
-- sklearn Inverse UDF
-- ----------------------------------

--DROP FUNCTION IF EXISTS ml.inverse(e bytea, v integer) CASCADE;

-- Map an encoded value back to its original label with a pickled encoder.
--   e : pickled object exposing an inverse_transform() method
--   v : the encoded value to decode
-- Returns the first element of the inverse_transform() result; NULL on NULL input.
CREATE OR REPLACE FUNCTION ml.inverse(e bytea, v integer)
RETURNS varchar
AS $$
import pickle

encoder = pickle.loads(e)
# inverse_transform() is given a one-element list, so the result holds one entry.
decoded = encoder.inverse_transform([v])
return decoded[0]
$$ LANGUAGE plpython3u
RETURNS NULL ON NULL INPUT;

-- ----------------------------------
-- SQL zur Anwendung des
-- Decision Tree Klassifikators
-- ----------------------------------

-- Apply the stored decision tree to two rows of the test split (pct_rnk > 0.80)
-- and show the actual class next to the decoded prediction.
WITH pred
AS
(
	SELECT
		 class
		,featurevector
	FROM ml.mushroom_prep
	WHERE pct_rnk > 0.80
	-- LIMIT without ORDER BY returns an unspecified pair of rows; ordering by
	-- pct_rnk pins the sample to the two lowest-ranked rows of the test split.
	ORDER BY pct_rnk
	LIMIT 2
), model
AS
(
	-- Encoders and classifier stored earlier, fetched through ml.get_model.
	SELECT
		 ml.get_model('Mushroom Classification', 'LabelEncoder') AS le
		,ml.get_model('Mushroom Classification', 'OneHotEncoder') AS fve
		,ml.get_model('Mushroom Classification', 'DecisionTreeClassifier') AS dtc
)
SELECT
	 d.class
	-- encode the features -> predict the encoded class -> decode it to a label;
	-- the cast is needed because ml.predict returns double precision while
	-- ml.inverse takes an integer
	,ml.inverse(m.le, ml.predict(m.dtc, ml.encode(m.fve, d.featurevector))::integer) AS prediction
	,d.featurevector
FROM pred d
CROSS JOIN model m;