updated the fairness analysis

This commit is contained in:
Brian Wright 2025-11-30 19:15:04 +00:00
parent 93a3363c6a
commit 3a29d02c57

View file

@@ -34,7 +34,9 @@
"id": "2f556f44", "id": "2f556f44",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### You should now be able to open your cloned repo in google collab, use the code below. " "### You should now be able to open your cloned repo in google collab, use the code below. \n",
"\n",
"### Also it is very helpful to have the variable inspector open while you go through this process. To do so go to tools>command palette>show variable inspector"
] ]
}, },
{ {
@@ -85,7 +87,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"id": "cca2a44d", "id": "cca2a44d",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -95,7 +97,6 @@
"import seaborn as sns\n", "import seaborn as sns\n",
"import matplotlib.pyplot as plt\n", "import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import MinMaxScaler\n", "from sklearn.preprocessing import MinMaxScaler\n",
"import mice\n",
"from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
@@ -140,6 +141,41 @@
"# How many rows are in the dataframe? How many columns?" "# How many rows are in the dataframe? How many columns?"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "aebf9d93",
"metadata": {},
"outputs": [],
"source": [
"# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categorical.\n",
"# Histograms for numeric variables\n",
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
"for col in numeric_columns: \n",
" plt.figure(figsize=(10, 6))\n",
" sns.histplot(\"xx\"[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.show() \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1d7469e",
"metadata": {},
"outputs": [],
"source": [
"# Bar charts for categorical variables\n",
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
"for col in categorical_columns:\n",
" plt.figure(figsize=(10, 6))\n",
" sns.countplot(x=\"xx\"[col])\n",
" plt.title(f'Bar Chart of {col}')\n",
" plt.xticks(rotation=45)\n",
" plt.show()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -194,9 +230,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Display missing data using mice for the dataset, which columns have missing data?\n", "# Display missing data using the isnull function, is there any missing data?\n",
"mice.plot_missing(\"xx\")\n", "print(\"xx\".isnull().sum())"
"plt.show()"
] ]
}, },
{ {
@@ -319,21 +354,6 @@
" accuracy = knn_model.score(X_test, y_test)" " accuracy = knn_model.score(X_test, y_test)"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "e866ae7d",
"metadata": {},
"outputs": [],
"source": [
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
"# predictions on the test set.\n",
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
"knn_model.fit(X_train, y_train)\n",
"y_pred = knn_model.predict(X_test) "
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -352,6 +372,21 @@
"plt.show()" "plt.show()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "e866ae7d",
"metadata": {},
"outputs": [],
"source": [
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
"# predictions on the test set.\n",
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
"knn_model.fit(X_train, y_train)\n",
"y_pred = knn_model.predict(X_test) "
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "3fa4f911", "id": "3fa4f911",
@@ -360,41 +395,6 @@
"## Value: Evaluation and Protected Classes" "## Value: Evaluation and Protected Classes"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "aebf9d93",
"metadata": {},
"outputs": [],
"source": [
"# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categoricals.\n",
"# Histograms for numeric variables\n",
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
"for col in numeric_columns: \n",
" plt.figure(figsize=(10, 6))\n",
" sns.histplot(\"xx\"[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.show() \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1d7469e",
"metadata": {},
"outputs": [],
"source": [
"# Bar charts for categorical variables\n",
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
"for col in categorical_columns:\n",
" plt.figure(figsize=(10, 6))\n",
" sns.countplot(x=\"xx\"[col])\n",
" plt.title(f'Bar Chart of {col}')\n",
" plt.xticks(rotation=45)\n",
" plt.show()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -435,14 +435,69 @@
" sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n", " sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n",
")\n", ")\n",
"\n", "\n",
"# Construct a MetricFrame for sex\n", "# Construct a MetricFrame for gender\n",
"mf_sex = MetricFrame(\n", "mf_gender = MetricFrame(\n",
" metrics=my_metrics,\n", " metrics=my_metrics,\n",
" y_true=y_test,\n", " y_true=y_test,\n",
" y_pred=y_pred,\n", " y_pred=y_pred,\n",
" sensitive_features=X_test[\"xx2\"] # Replace second protected class \n", " sensitive_features=X_test[\"xx2\"] # Replace second protected class \n",
") \n" ") \n"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa47711b",
"metadata": {},
"outputs": [],
"source": [
"mf_race.by_group #What do the results show? Change the mf_race with each subgroup and report the findings. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cdcfd773",
"metadata": {},
"outputs": [],
"source": [
"mf_gender.by_group #What do the results show?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0ad32f0",
"metadata": {},
"outputs": [],
"source": [
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
"# two fairness ratios using the gender_m feature. What do the results show, is the model more or less fair with this grouping?\n",
"\n",
"dpr_gender = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
"print(\"Demographic Parity ratio:\\t\", dpr_gender)\n",
"\n",
"eodds_gender = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
"print(\"Equalized Odds ratio:\\t\\t\", eodds_gender)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d008f7a",
"metadata": {},
"outputs": [],
"source": [
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
"# the same features above only using a filtered search to pull in all the possibilities of features\n",
"# starting with \"race\". What do the results show, is the model more or less fair with this grouping?\n",
"\n",
"dpr_race = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
"print(\"Demographic Parity ratio:\\t\", dpr_race)\n",
"\n",
"eodds_race = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
"print(\"Equalized Odds ratio:\\t\\t\", eodds_race)"
]
} }
], ],
"metadata": { "metadata": {