mirror of
https://github.com/harivansh-afk/ds1001_final.git
synced 2026-04-15 07:04:46 +00:00
updated the fairness analysis
This commit is contained in:
parent
93a3363c6a
commit
3a29d02c57
1 changed files with 113 additions and 58 deletions
|
|
@ -34,7 +34,9 @@
|
|||
"id": "2f556f44",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### You should now be able to open your cloned repo in google collab, use the code below. "
|
||||
"### You should now be able to open your cloned repo in Google Colab; use the code below. \n",
|
||||
"\n",
|
||||
"### It is also very helpful to have the variable inspector open while you go through this process. To open it, go to Tools > Command palette > Show variable inspector."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -85,7 +87,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"id": "cca2a44d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -95,7 +97,6 @@
|
|||
"import seaborn as sns\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from sklearn.preprocessing import MinMaxScaler\n",
|
||||
"import mice\n",
|
||||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
||||
|
|
@ -140,6 +141,41 @@
|
|||
"# How many rows are in the dataframe? How many columns?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aebf9d93",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Explore the variables a bit more: create histograms for the numeric values and bar charts for the categorical ones.\n",
|
||||
"# Histograms for numeric variables\n",
|
||||
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
|
||||
"for col in numeric_columns: \n",
|
||||
" plt.figure(figsize=(10, 6))\n",
|
||||
" sns.histplot(\"xx\"[col], kde=True)\n",
|
||||
" plt.title(f'Histogram of {col}')\n",
|
||||
" plt.show() \n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c1d7469e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Bar charts for categorical variables\n",
|
||||
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
|
||||
"for col in categorical_columns:\n",
|
||||
" plt.figure(figsize=(10, 6))\n",
|
||||
" sns.countplot(x=\"xx\"[col])\n",
|
||||
" plt.title(f'Bar Chart of {col}')\n",
|
||||
" plt.xticks(rotation=45)\n",
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
@ -194,9 +230,8 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Display missing data using mice for the dataset, which columns have missing data?\n",
|
||||
"mice.plot_missing(\"xx\")\n",
|
||||
"plt.show()"
|
||||
"# Display missing data using the isnull function, is there any missing data?\n",
|
||||
"print(\"xx\".isnull().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -319,21 +354,6 @@
|
|||
" accuracy = knn_model.score(X_test, y_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e866ae7d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
|
||||
"# predictions on the test set.\n",
|
||||
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
|
||||
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
|
||||
"knn_model.fit(X_train, y_train)\n",
|
||||
"y_pred = knn_model.predict(X_test) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
@ -352,6 +372,21 @@
|
|||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e866ae7d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
|
||||
"# predictions on the test set.\n",
|
||||
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
|
||||
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
|
||||
"knn_model.fit(X_train, y_train)\n",
|
||||
"y_pred = knn_model.predict(X_test) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3fa4f911",
|
||||
|
|
@ -360,41 +395,6 @@
|
|||
"## Value: Evaluation and Protected Classes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aebf9d93",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categoricals.\n",
|
||||
"# Histograms for numeric variables\n",
|
||||
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
|
||||
"for col in numeric_columns: \n",
|
||||
" plt.figure(figsize=(10, 6))\n",
|
||||
" sns.histplot(\"xx\"[col], kde=True)\n",
|
||||
" plt.title(f'Histogram of {col}')\n",
|
||||
" plt.show() \n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c1d7469e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Bar charts for categorical variables\n",
|
||||
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
|
||||
"for col in categorical_columns:\n",
|
||||
" plt.figure(figsize=(10, 6))\n",
|
||||
" sns.countplot(x=\"xx\"[col])\n",
|
||||
" plt.title(f'Bar Chart of {col}')\n",
|
||||
" plt.xticks(rotation=45)\n",
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
@ -435,14 +435,69 @@
|
|||
" sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Construct a MetricFrame for sex\n",
|
||||
"mf_sex = MetricFrame(\n",
|
||||
"# Construct a MetricFrame for gender\n",
|
||||
"mf_gender = MetricFrame(\n",
|
||||
" metrics=my_metrics,\n",
|
||||
" y_true=y_test,\n",
|
||||
" y_pred=y_pred,\n",
|
||||
" sensitive_features=X_test[\"xx2\"] # Replace second protected class \n",
|
||||
") \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aa47711b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mf_race.by_group # What do the results show? Replace mf_race with the MetricFrame for each protected class and report the findings. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cdcfd773",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mf_gender.by_group #What do the results show?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c0ad32f0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
|
||||
"# two fairness ratios using the gender_m feature. What do the results show: is the model more or less fair with this grouping?\n",
|
||||
"\n",
|
||||
"dpr_gender = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
|
||||
"print(\"Demographic Parity ratio:\\t\", dpr_gender)\n",
|
||||
"\n",
|
||||
"eodds_gender = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
|
||||
"print(\"Equalized Odds ratio:\\t\\t\", eodds_gender)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8d008f7a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
|
||||
"# same two ratios as above, only using a filtered search to pull in all the possible features\n",
|
||||
"# starting with \"race\". What do the results show, is the model more or less fair with this grouping?\n",
|
||||
"\n",
|
||||
"dpr_race = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
|
||||
"print(\"Demographic Parity ratio:\\t\", dpr_race)\n",
|
||||
"\n",
|
||||
"eodds_race = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
|
||||
"print(\"Equalized Odds ratio:\\t\\t\", eodds_race)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue