# Listing 1: Filtering out stop words
def text_cleaner(sentence, nlp=English()):
    """
    Tokenize text and remove punctuation and stop words.

    Input:
     sentence: string containing words
     nlp: spaCy Language object used for tokenizing and as the source of
          the stop-word list (nlp.Defaults.stop_words), default: English()
    Output:
     clean_doc: string of the remaining token texts joined by single
                spaces; every 'gt;' is removed and runs of dots and/or
                whitespace are collapsed to one space.
    """
    # Tokenize `sentence` into a Doc object.
    doc = nlp(sentence)
    stopwords = nlp.Defaults.stop_words

    # Keep the text of every token that is neither punctuation nor a stop
    # word. The comparison uses the lower-cased text so that capitalised
    # stop words (e.g. "The" at the start of a sentence) are removed too.
    kept_tokens = [
        token.text
        for token in doc
        if not token.is_punct and token.text.lower() not in stopwords
    ]
    joined = " ".join(kept_tokens)

    # Drop 'gt;' fragments (presumably leftovers of the HTML entity '&gt;').
    joined = re.sub("gt;", "", joined)
    # Collapse runs of dots and whitespace into a single space. The raw
    # string avoids Python's invalid-escape-sequence warning.
    clean_doc = re.sub(r"[.\s]+", " ", joined)

    return clean_doc


# Listing 2: scikit-learn's cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

# Instantiate the model
clf = MultinomialNB()

# Metrics evaluated in every cross-validation run; the results are read
# below under the keys "test_<metric>".
scoring_metrics = ["precision_weighted", "recall_weighted", "f1_weighted"]

# 5-fold cross-validation of the bag-of-words (BoW) method.
# cv is passed explicitly so the number of folds does not depend on the
# default of the installed scikit-learn version.
cv_results_bow = cross_validate(
    estimator=clf,
    X=features_bow_microsoft,
    y=target_microsoft,
    scoring=scoring_metrics,
    cv=5,
)

# 5-fold cross-validation of the TF-IDF method
cv_results_tfidf = cross_validate(
    estimator=clf,
    X=features_tfidf_microsoft,
    y=target_microsoft,
    scoring=scoring_metrics,
    cv=5,
)

# Compute the average value of each metric
# (across all cross-validation folds)
bow_precision = cv_results_bow["test_precision_weighted"].mean()
bow_recall = cv_results_bow["test_recall_weighted"].mean()
bow_f1 = cv_results_bow["test_f1_weighted"].mean()

tfidf_precision = cv_results_tfidf["test_precision_weighted"].mean()
tfidf_recall = cv_results_tfidf["test_recall_weighted"].mean()
tfidf_f1 = cv_results_tfidf["test_f1_weighted"].mean()


# Listing 3: GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Build one pipeline per method: a vectorizer step named "vect" followed
# by a Multinomial Naive Bayes classifier step named "clf".
bow_pipeline = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("clf", MultinomialNB()),
])
tfidf_pipeline = Pipeline(steps=[
    ("vect", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

# Parameters to be tested; the "vect__" prefix targets the "vect" step.
param_search_space = {
    "vect__ngram_range": ((1, 1), (1, 2), (2, 2), (1, 3)),
    "vect__max_features": (None, 5000, 10000, 25000),
}

# Grid-search objects for the BoW and the TF-IDF pipeline over the same
# parameter space (5-fold CV, weighted F1, all available cores).
bow_grid_search = GridSearchCV(
    bow_pipeline,
    param_search_space,
    scoring="f1_weighted",
    cv=5,
    n_jobs=-1,
)
tfidf_grid_search = GridSearchCV(
    tfidf_pipeline,
    param_search_space,
    scoring="f1_weighted",
    cv=5,
    n_jobs=-1,
)

# Fit the BoW search to determine its best parameter combination, then
# read all parameters of the best pipeline found.
bow_grid_search.fit(features_microsoft, target_microsoft)
bow_best_params = bow_grid_search.best_estimator_.get_params()

# Same for the TF-IDF search.
tfidf_grid_search.fit(features_microsoft, target_microsoft)
tfidf_best_params = tfidf_grid_search.best_estimator_.get_params()


