diff --git a/ds1001_final/notebooks/Final_Project_Notebook.ipynb b/ds1001_final/notebooks/Final_Project_Notebook.ipynb index 8d9b476..c3d8790 100644 --- a/ds1001_final/notebooks/Final_Project_Notebook.ipynb +++ b/ds1001_final/notebooks/Final_Project_Notebook.ipynb @@ -34,7 +34,9 @@ "id": "2f556f44", "metadata": {}, "source": [ - "### You should now be able to open your cloned repo in google collab, use the code below. " + "### You should now be able to open your cloned repo in Google Colab; use the code below. \n", + "\n", + "### Also, it is very helpful to have the Variable Inspector open while you go through this process. To do so, go to Tools > Command Palette > Show Variable Inspector" ] }, { @@ -85,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "cca2a44d", "metadata": {}, "outputs": [], @@ -95,7 +97,6 @@ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import MinMaxScaler\n", - "import mice\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", @@ -140,6 +141,41 @@ "# How many rows are in the dataframe? How many columns?" 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "aebf9d93", + "metadata": {}, + "outputs": [], + "source": [ + "# Explore the variables a bit more: create histograms for the numeric values and bar charts for the categorical ones.\n", + "# Histograms for numeric variables\n", + "numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n", + "for col in numeric_columns: \n", + " plt.figure(figsize=(10, 6))\n", + " sns.histplot(\"xx\"[col], kde=True)\n", + " plt.title(f'Histogram of {col}')\n", + " plt.show() \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1d7469e", + "metadata": {}, + "outputs": [], + "source": [ + "# Bar charts for categorical variables\n", + "categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n", + "for col in categorical_columns:\n", + " plt.figure(figsize=(10, 6))\n", + " sns.countplot(x=\"xx\"[col])\n", + " plt.title(f'Bar Chart of {col}')\n", + " plt.xticks(rotation=45)\n", + " plt.show()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -194,9 +230,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Display missing data using mice for the dataset, which columns have missing data?\n", - "mice.plot_missing(\"xx\")\n", - "plt.show()" + "# Display missing data using the isnull function; is there any missing data?\n", + "print(\"xx\".isnull().sum())" ] }, { @@ -319,21 +354,6 @@ " accuracy = knn_model.score(X_test, y_test)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "e866ae7d", - "metadata": {}, - "outputs": [], - "source": [ - "# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n", - "# predictions on the test set.\n", - "best_k = 'xx' # Replace 'xx' with the best k value found\n", - "knn_model = KNeighborsClassifier(n_neighbors=best_k)\n", - "knn_model.fit(X_train, y_train)\n", - "y_pred = knn_model.predict(X_test) " - ] - }, { "cell_type": "code", "execution_count": null, @@ -352,6 
+372,21 @@ "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "e866ae7d", + "metadata": {}, + "outputs": [], + "source": [ + "# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n", + "# predictions on the test set.\n", + "best_k = 'xx' # Replace 'xx' with the best k value found\n", + "knn_model = KNeighborsClassifier(n_neighbors=best_k)\n", + "knn_model.fit(X_train, y_train)\n", + "y_pred = knn_model.predict(X_test) " + ] + }, { "cell_type": "markdown", "id": "3fa4f911", @@ -360,41 +395,6 @@ "## Value: Evaluation and Protected Classes" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "aebf9d93", - "metadata": {}, - "outputs": [], - "source": [ - "# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categoricals.\n", - "# Histograms for numeric variables\n", - "numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n", - "for col in numeric_columns: \n", - " plt.figure(figsize=(10, 6))\n", - " sns.histplot(\"xx\"[col], kde=True)\n", - " plt.title(f'Histogram of {col}')\n", - " plt.show() \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1d7469e", - "metadata": {}, - "outputs": [], - "source": [ - "# Bar charts for categorical variables\n", - "categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n", - "for col in categorical_columns:\n", - " plt.figure(figsize=(10, 6))\n", - " sns.countplot(x=\"xx\"[col])\n", - " plt.title(f'Bar Chart of {col}')\n", - " plt.xticks(rotation=45)\n", - " plt.show()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -435,14 +435,69 @@ " sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n", ")\n", "\n", - "# Construct a MetricFrame for sex\n", - "mf_sex = MetricFrame(\n", + "# Construct a MetricFrame for gender\n", + "mf_gender = MetricFrame(\n", " metrics=my_metrics,\n", " y_true=y_test,\n", " 
y_pred=y_pred,\n", " sensitive_features=X_test[\"xx2\"] # Replace second protected class \n", ") \n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa47711b", + "metadata": {}, + "outputs": [], + "source": [ + "mf_race.by_group # What do the results show? Replace mf_race with each subgroup and report the findings. " ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdcfd773", + "metadata": {}, + "outputs": [], + "source": [ + "mf_gender.by_group # What do the results show?" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0ad32f0", + "metadata": {}, + "outputs": [], + "source": [ + "# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n", + "# two fairness ratios using the gender_m feature. What do the results show? Is the model more or less fair with this grouping?\n", + "\n", + "dpr_gender = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n", + "print(\"Demographic Parity ratio:\\t\", dpr_gender)\n", + "\n", + "eodds_gender = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n", + "print(\"Equalized Odds ratio:\\t\\t\", eodds_gender)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d008f7a", + "metadata": {}, + "outputs": [], + "source": [ + "# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n", + "# same features as above, only using a filtered search to pull in all the possibilities of features\n", + "# starting with \"race\". 
What do the results show? Is the model more or less fair with this grouping?\n", + "\n", + "dpr_race = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n", + "print(\"Demographic Parity ratio:\\t\", dpr_race)\n", + "\n", + "eodds_race = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n", + "print(\"Equalized Odds ratio:\\t\\t\", eodds_race)" ] } ], "metadata": {