updated the fairness analysis

This commit is contained in:
Brian Wright 2025-11-30 19:15:04 +00:00
parent 93a3363c6a
commit 3a29d02c57

View file

@@ -34,7 +34,9 @@
"id": "2f556f44", "id": "2f556f44",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### You should now be able to open your cloned repo in google collab, use the code below. " "### You should now be able to open your cloned repo in google collab, use the code below. \n",
"\n",
"### Also it is very helpful to have the variable inspector open while you go through this process. To do so go to tools>command palette>show variable inspector"
] ]
}, },
{ {
@@ -85,7 +87,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"id": "cca2a44d", "id": "cca2a44d",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -95,7 +97,6 @@
"import seaborn as sns\n", "import seaborn as sns\n",
"import matplotlib.pyplot as plt\n", "import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import MinMaxScaler\n", "from sklearn.preprocessing import MinMaxScaler\n",
"import mice\n",
"from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
@@ -140,6 +141,41 @@
"# How many rows are in the dataframe? How many columns?" "# How many rows are in the dataframe? How many columns?"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "aebf9d93",
"metadata": {},
"outputs": [],
"source": [
"# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categorical.\n",
"# Histograms for numeric variables\n",
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
"for col in numeric_columns: \n",
" plt.figure(figsize=(10, 6))\n",
" sns.histplot(\"xx\"[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.show() \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1d7469e",
"metadata": {},
"outputs": [],
"source": [
"# Bar charts for categorical variables\n",
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
"for col in categorical_columns:\n",
" plt.figure(figsize=(10, 6))\n",
" sns.countplot(x=\"xx\"[col])\n",
" plt.title(f'Bar Chart of {col}')\n",
" plt.xticks(rotation=45)\n",
" plt.show()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -194,9 +230,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Display missing data using mice for the dataset, which columns have missing data?\n", "# Display missing data using the isnull function, is there any missing data?\n",
"mice.plot_missing(\"xx\")\n", "print(\"xx\".isnull().sum())"
"plt.show()"
] ]
}, },
{ {
@@ -319,21 +354,6 @@
" accuracy = knn_model.score(X_test, y_test)" " accuracy = knn_model.score(X_test, y_test)"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "e866ae7d",
"metadata": {},
"outputs": [],
"source": [
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
"# predictions on the test set.\n",
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
"knn_model.fit(X_train, y_train)\n",
"y_pred = knn_model.predict(X_test) "
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -352,6 +372,21 @@
"plt.show()" "plt.show()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "e866ae7d",
"metadata": {},
"outputs": [],
"source": [
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
"# predictions on the test set.\n",
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
"knn_model.fit(X_train, y_train)\n",
"y_pred = knn_model.predict(X_test) "
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "3fa4f911", "id": "3fa4f911",
@@ -360,41 +395,6 @@
"## Value: Evaluation and Protected Classes" "## Value: Evaluation and Protected Classes"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "aebf9d93",
"metadata": {},
"outputs": [],
"source": [
"# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categoricals.\n",
"# Histograms for numeric variables\n",
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
"for col in numeric_columns: \n",
" plt.figure(figsize=(10, 6))\n",
" sns.histplot(\"xx\"[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.show() \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1d7469e",
"metadata": {},
"outputs": [],
"source": [
"# Bar charts for categorical variables\n",
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
"for col in categorical_columns:\n",
" plt.figure(figsize=(10, 6))\n",
" sns.countplot(x=\"xx\"[col])\n",
" plt.title(f'Bar Chart of {col}')\n",
" plt.xticks(rotation=45)\n",
" plt.show()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -435,14 +435,69 @@
" sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n", " sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n",
")\n", ")\n",
"\n", "\n",
"# Construct a MetricFrame for sex\n", "# Construct a MetricFrame for gender\n",
"mf_sex = MetricFrame(\n", "mf_gender = MetricFrame(\n",
" metrics=my_metrics,\n", " metrics=my_metrics,\n",
" y_true=y_test,\n", " y_true=y_test,\n",
" y_pred=y_pred,\n", " y_pred=y_pred,\n",
" sensitive_features=X_test[\"xx2\"] # Replace second protected class \n", " sensitive_features=X_test[\"xx2\"] # Replace second protected class \n",
") \n" ") \n"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa47711b",
"metadata": {},
"outputs": [],
"source": [
"mf_race.by_group #What do the results show? Change the mf_race with each subgroup and report the findings. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cdcfd773",
"metadata": {},
"outputs": [],
"source": [
"mf_gender.by_group #What do the results show?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0ad32f0",
"metadata": {},
"outputs": [],
"source": [
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
"# two fairness ratios using the gender_m feature. What do the results show, is the model more or less fair with this grouping?\n",
"\n",
"dpr_gender = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
"print(\"Demographic Parity ratio:\\t\", dpr_gender)\n",
"\n",
"eodds_gender = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
"print(\"Equalized Odds ratio:\\t\\t\", eodds_gender)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d008f7a",
"metadata": {},
"outputs": [],
"source": [
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
"# the same features above only using a filtered search to pull in all the possibilities of features\n",
"# starting with \"race\". What do the results show, is the model more or less fair with this grouping?\n",
"\n",
"dpr_race = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
"print(\"Demographic Parity ratio:\\t\", dpr_race)\n",
"\n",
"eodds_race = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
"print(\"Equalized Odds ratio:\\t\\t\", eodds_race)"
]
} }
], ],
"metadata": { "metadata": {