updated the fairness analysis

This commit is contained in:
Brian Wright 2025-11-30 19:15:04 +00:00
parent 93a3363c6a
commit 3a29d02c57

View file

@ -34,7 +34,9 @@
"id": "2f556f44",
"metadata": {},
"source": [
"### You should now be able to open your cloned repo in google collab, use the code below. "
"### You should now be able to open your cloned repo in Google Colab; use the code below. \n",
"\n",
"### Also it is very helpful to have the variable inspector open while you go through this process. To do so, go to Tools > Command Palette > Show Variable Inspector"
]
},
{
@ -85,7 +87,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "cca2a44d",
"metadata": {},
"outputs": [],
@ -95,7 +97,6 @@
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"import mice\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
@ -140,6 +141,41 @@
"# How many rows are in the dataframe? How many columns?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aebf9d93",
"metadata": {},
"outputs": [],
"source": [
"# Explore the variables a bit more, create histograms for the numeric values and bar charts for the categoricals.\n",
"# Histograms for numeric variables\n",
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
"for col in numeric_columns: \n",
" plt.figure(figsize=(10, 6))\n",
" sns.histplot(\"xx\"[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.show() \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1d7469e",
"metadata": {},
"outputs": [],
"source": [
"# Bar charts for categorical variables\n",
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
"for col in categorical_columns:\n",
" plt.figure(figsize=(10, 6))\n",
" sns.countplot(x=\"xx\"[col])\n",
" plt.title(f'Bar Chart of {col}')\n",
" plt.xticks(rotation=45)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -194,9 +230,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Display missing data using mice for the dataset, which columns have missing data?\n",
"mice.plot_missing(\"xx\")\n",
"plt.show()"
"# Display missing data using the isnull function, is there any missing data?\n",
"print(\"xx\".isnull().sum())"
]
},
{
@ -319,21 +354,6 @@
" accuracy = knn_model.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e866ae7d",
"metadata": {},
"outputs": [],
"source": [
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
"# predictions on the test set.\n",
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
"knn_model.fit(X_train, y_train)\n",
"y_pred = knn_model.predict(X_test) "
]
},
{
"cell_type": "code",
"execution_count": null,
@ -352,6 +372,21 @@
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e866ae7d",
"metadata": {},
"outputs": [],
"source": [
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
"# predictions on the test set.\n",
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
"knn_model.fit(X_train, y_train)\n",
"y_pred = knn_model.predict(X_test) "
]
},
{
"cell_type": "markdown",
"id": "3fa4f911",
@ -360,41 +395,6 @@
"## Value: Evaluation and Protected Classes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aebf9d93",
"metadata": {},
"outputs": [],
"source": [
"# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categoricals.\n",
"# Histograms for numeric variables\n",
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
"for col in numeric_columns: \n",
" plt.figure(figsize=(10, 6))\n",
" sns.histplot(\"xx\"[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.show() \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1d7469e",
"metadata": {},
"outputs": [],
"source": [
"# Bar charts for categorical variables\n",
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
"for col in categorical_columns:\n",
" plt.figure(figsize=(10, 6))\n",
" sns.countplot(x=\"xx\"[col])\n",
" plt.title(f'Bar Chart of {col}')\n",
" plt.xticks(rotation=45)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -435,14 +435,69 @@
" sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n",
")\n",
"\n",
"# Construct a MetricFrame for sex\n",
"mf_sex = MetricFrame(\n",
"# Construct a MetricFrame for gender\n",
"mf_gender = MetricFrame(\n",
" metrics=my_metrics,\n",
" y_true=y_test,\n",
" y_pred=y_pred,\n",
" sensitive_features=X_test[\"xx2\"] # Replace second protected class \n",
") \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa47711b",
"metadata": {},
"outputs": [],
"source": [
"mf_race.by_group # What do the results show? Swap mf_race for each subgroup's MetricFrame and report the findings. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cdcfd773",
"metadata": {},
"outputs": [],
"source": [
"mf_gender.by_group #What do the results show?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0ad32f0",
"metadata": {},
"outputs": [],
"source": [
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
"# two fairness ratios using the gender_m feature. What do the results show, is the model more or less fair with this grouping?\n",
"\n",
"dpr_gender = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
"print(\"Demographic Parity ratio:\\t\", dpr_gender)\n",
"\n",
"eodds_gender = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
"print(\"Equalized Odds ratio:\\t\\t\", eodds_gender)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d008f7a",
"metadata": {},
"outputs": [],
"source": [
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
"# same features as above, only using a filtered search to pull in all the possibilities of features\n",
"# starting with \"race\". What do the results show, is the model more or less fair with this grouping?\n",
"\n",
"dpr_race = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
"print(\"Demographic Parity ratio:\\t\", dpr_race)\n",
"\n",
"eodds_race = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
"print(\"Equalized Odds ratio:\\t\\t\", eodds_race)"
]
}
],
"metadata": {