Unterschiede
Hier werden die Unterschiede zwischen zwei Versionen angezeigt.
| Beide Seiten der vorigen Revision Vorhergehende Überarbeitung Nächste Überarbeitung | Vorhergehende Überarbeitung | ||
| de:modul:m245:learningunits:lu02:loesungen:l02 [2026/01/05 13:32] – vdemir | de:modul:m245:learningunits:lu02:loesungen:l02 [2026/04/08 08:43] (aktuell) – [Modellvergleich] vdemir | ||
|---|---|---|---|
| Zeile 3: | Zeile 3: | ||
| ===== Voraussetzung ===== | ===== Voraussetzung ===== | ||
| - | | + | <code bash> |
| - | + | pip install pandas scikit-learn joblib | |
| + | </code> | ||
| ===== Python-Skript: | ===== Python-Skript: | ||
| + | <code python> | ||
| + | import pandas as pd | ||
| + | from sklearn.model_selection import train_test_split | ||
| + | from sklearn.pipeline import Pipeline | ||
| + | from sklearn.preprocessing import StandardScaler | ||
| + | from sklearn.linear_model import LogisticRegression | ||
| + | from sklearn.tree import DecisionTreeClassifier | ||
| + | from sklearn.metrics import accuracy_score, | ||
| + | import joblib | ||
| + | |||
| + | # ----------------------------- | ||
| + | # Daten laden | ||
| + | # ----------------------------- | ||
| + | data = pd.read_csv(" | ||
| + | X = data.drop(" | ||
| + | y = data[" | ||
| - | import pandas as pd | + | # ----------------------------- |
| - | from sklearn.model_selection import train_test_split | + | # Train / Test Split |
| - | from sklearn.pipeline import Pipeline | + | # ----------------------------- |
| - | from sklearn.preprocessing import StandardScaler | + | X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, |
| - | from sklearn.linear_model import LogisticRegression | + | |
| - | from sklearn.tree import DecisionTreeClassifier | + | # ----------------------------- |
| - | from sklearn.metrics import accuracy_score, | + | # Modell 1: Logistische Regression |
| - | import joblib | + | # ----------------------------- |
| - | # | + | log_reg_pipeline = Pipeline([ |
| - | # ----------------------------- | + | (" |
| - | # Daten laden | + | (" |
| - | # ----------------------------- | + | ]) |
| - | data = pd.read_csv(" | + | |
| - | # | + | log_reg_pipeline.fit(X_train, |
| - | X = data.drop(" | + | y_pred_lr = log_reg_pipeline.predict(X_test) |
| - | y = data[" | + | # |
| - | # | + | print(" |
| - | | + | print(" |
| - | # Train / Test Split | + | print(" |
| - | # ----------------------------- | + | print(" |
| - | X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, | + | |
| - | # | + | # ----------------------------- |
| - | # ----------------------------- | + | # Modell 2: Decision Tree |
| - | # Modell 1: Logistische Regression | + | # ----------------------------- |
| - | # ----------------------------- | + | tree_model = DecisionTreeClassifier(random_state=42) |
| - | log_reg_pipeline = Pipeline([ | + | tree_model.fit(X_train, |
| - | (" | + | y_pred_tree = tree_model.predict(X_test) |
| - | (" | + | |
| - | ]) | + | print(" |
| - | # | + | print(" |
| - | log_reg_pipeline.fit(X_train, | + | print(" |
| - | y_pred_lr = log_reg_pipeline.predict(X_test) | + | print(" |
| - | # | + | |
| - | print(" | + | # ----------------------------- |
| - | print(" | + | # Bestes Modell speichern |
| - | print(" | + | # ----------------------------- |
| - | print(" | + | joblib.dump(log_reg_pipeline, |
| - | # | + | |
| - | # ----------------------------- | + | # ----------------------------- |
| - | # Modell 2: Decision Tree | + | # Neue Vorhersage |
| - | # ----------------------------- | + | # ----------------------------- |
| - | tree_model = DecisionTreeClassifier(random_state=42) | + | new_customer = pd.DataFrame([{ |
| - | tree_model.fit(X_train, | + | " |
| - | y_pred_tree = tree_model.predict(X_test) | + | " |
| - | # | + | " |
| - | print(" | + | }]) |
| - | print(" | + | |
| - | print(" | + | loaded_model = joblib.load(" |
| - | print(" | + | prediction = loaded_model.predict(new_customer) |
| - | # | + | |
| - | # ----------------------------- | + | print(" |
| - | # Bestes Modell speichern | + | </ |
| - | # ----------------------------- | + | |
| - | joblib.dump(log_reg_pipeline, | + | |
| - | # | + | |
| - | # ----------------------------- | + | |
| - | # Neue Vorhersage | + | |
| - | # ----------------------------- | + | |
| - | new_customer = pd.DataFrame([{ | + | |
| - | " | + | |
| - | " | + | |
| - | " | + | |
| - | }]) | + | |
| - | # | + | |
| - | loaded_model = joblib.load(" | + | |
| - | prediction = loaded_model.predict(new_customer) | + | |
| - | # | + | |
| - | print(" | + | |
| | | ||
| - | + | ===== Modellvergleich ===== | |
| - | ===== 3. Modellvergleich ===== | + | |
| ^ Kriterium ^ Logistische Regression ^ Decision Tree ^ | ^ Kriterium ^ Logistische Regression ^ Decision Tree ^ | ||
| | Interpretierbarkeit | hoch | mittel | | | Interpretierbarkeit | hoch | mittel | | ||
| | Overfitting-Gefahr | gering | hoch | | | Overfitting-Gefahr | gering | hoch | | ||
| - | | Skalierung | noetig ja | nein | | + | | Skalierung | nötig ja | nein | |
| | Didaktisch | sinnvoll sehr | ja | | | Didaktisch | sinnvoll sehr | ja | | ||
| - | Fazit: | + | ===== Fazit ===== |
| - | Bei kleinen, sauberen | + | |
| - | Decision Trees sind anschaulich, | + | |
| + | |||
| + | ---- | ||
| + | [[https:// | ||
| | | ||