mirror of
https://github.com/harivansh-afk/ds1001_final.git
synced 2026-04-15 07:04:46 +00:00
updated the fairness analysis
This commit is contained in:
parent
93a3363c6a
commit
3a29d02c57
1 changed files with 113 additions and 58 deletions
|
|
@ -34,7 +34,9 @@
|
||||||
"id": "2f556f44",
|
"id": "2f556f44",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### You should now be able to open your cloned repo in google collab, use the code below. "
|
"### You should now be able to open your cloned repo in Google Colab; use the code below. \n",
|
||||||
|
"\n",
|
||||||
|
"### Also, it is very helpful to have the variable inspector open while you go through this process. To do so, go to Tools > Command palette > Show variable inspector."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -85,7 +87,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"id": "cca2a44d",
|
"id": "cca2a44d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -95,7 +97,6 @@
|
||||||
"import seaborn as sns\n",
|
"import seaborn as sns\n",
|
||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt\n",
|
||||||
"from sklearn.preprocessing import MinMaxScaler\n",
|
"from sklearn.preprocessing import MinMaxScaler\n",
|
||||||
"import mice\n",
|
|
||||||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
||||||
|
|
@ -140,6 +141,41 @@
|
||||||
"# How many rows are in the dataframe? How many columns?"
|
"# How many rows are in the dataframe? How many columns?"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "aebf9d93",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Explore the variables a bit more: create histograms for the numeric values and bar charts for the categorical ones.\n",
|
||||||
|
"# Histograms for numeric variables\n",
|
||||||
|
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
|
||||||
|
"for col in numeric_columns: \n",
|
||||||
|
" plt.figure(figsize=(10, 6))\n",
|
||||||
|
" sns.histplot(\"xx\"[col], kde=True)\n",
|
||||||
|
" plt.title(f'Histogram of {col}')\n",
|
||||||
|
" plt.show() \n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c1d7469e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Bar charts for categorical variables\n",
|
||||||
|
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
|
||||||
|
"for col in categorical_columns:\n",
|
||||||
|
" plt.figure(figsize=(10, 6))\n",
|
||||||
|
" sns.countplot(x=\"xx\"[col])\n",
|
||||||
|
" plt.title(f'Bar Chart of {col}')\n",
|
||||||
|
" plt.xticks(rotation=45)\n",
|
||||||
|
" plt.show()"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
|
@ -194,9 +230,8 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Display missing data using mice for the dataset, which columns have missing data?\n",
|
"# Display missing data using the isnull function. Is there any missing data?\n",
|
||||||
"mice.plot_missing(\"xx\")\n",
|
"print(\"xx\".isnull().sum())"
|
||||||
"plt.show()"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -319,21 +354,6 @@
|
||||||
" accuracy = knn_model.score(X_test, y_test)"
|
" accuracy = knn_model.score(X_test, y_test)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "e866ae7d",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
|
|
||||||
"# predictions on the test set.\n",
|
|
||||||
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
|
|
||||||
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
|
|
||||||
"knn_model.fit(X_train, y_train)\n",
|
|
||||||
"y_pred = knn_model.predict(X_test) "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
|
@ -352,6 +372,21 @@
|
||||||
"plt.show()"
|
"plt.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e866ae7d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n",
|
||||||
|
"# predictions on the test set.\n",
|
||||||
|
"best_k = 'xx' # Replace 'xx' with the best k value found\n",
|
||||||
|
"knn_model = KNeighborsClassifier(n_neighbors=best_k)\n",
|
||||||
|
"knn_model.fit(X_train, y_train)\n",
|
||||||
|
"y_pred = knn_model.predict(X_test) "
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "3fa4f911",
|
"id": "3fa4f911",
|
||||||
|
|
@ -360,41 +395,6 @@
|
||||||
"## Value: Evaluation and Protected Classes"
|
"## Value: Evaluation and Protected Classes"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "aebf9d93",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categoricals.\n",
|
|
||||||
"# Histograms for numeric variables\n",
|
|
||||||
"numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n",
|
|
||||||
"for col in numeric_columns: \n",
|
|
||||||
" plt.figure(figsize=(10, 6))\n",
|
|
||||||
" sns.histplot(\"xx\"[col], kde=True)\n",
|
|
||||||
" plt.title(f'Histogram of {col}')\n",
|
|
||||||
" plt.show() \n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c1d7469e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Bar charts for categorical variables\n",
|
|
||||||
"categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n",
|
|
||||||
"for col in categorical_columns:\n",
|
|
||||||
" plt.figure(figsize=(10, 6))\n",
|
|
||||||
" sns.countplot(x=\"xx\"[col])\n",
|
|
||||||
" plt.title(f'Bar Chart of {col}')\n",
|
|
||||||
" plt.xticks(rotation=45)\n",
|
|
||||||
" plt.show()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
|
@ -435,14 +435,69 @@
|
||||||
" sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n",
|
" sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Construct a MetricFrame for sex\n",
|
"# Construct a MetricFrame for gender\n",
|
||||||
"mf_sex = MetricFrame(\n",
|
"mf_gender = MetricFrame(\n",
|
||||||
" metrics=my_metrics,\n",
|
" metrics=my_metrics,\n",
|
||||||
" y_true=y_test,\n",
|
" y_true=y_test,\n",
|
||||||
" y_pred=y_pred,\n",
|
" y_pred=y_pred,\n",
|
||||||
" sensitive_features=X_test[\"xx2\"] # Replace second protected class \n",
|
" sensitive_features=X_test[\"xx2\"] # Replace second protected class \n",
|
||||||
") \n"
|
") \n"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "aa47711b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"mf_race.by_group #What do the results show? Replace mf_race with the MetricFrame for each protected class and report the findings. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "cdcfd773",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"mf_gender.by_group #What do the results show?"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c0ad32f0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
|
||||||
|
"# two fairness ratios using the gender_m feature. What do the results show, is the model more or less fair with this grouping?\n",
|
||||||
|
"\n",
|
||||||
|
"dpr_gender = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
|
||||||
|
"print(\"Demographic Parity ratio:\\t\", dpr_gender)\n",
|
||||||
|
"\n",
|
||||||
|
"eodds_gender = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test['gender_m'])\n",
|
||||||
|
"print(\"Equalized Odds ratio:\\t\\t\", eodds_gender)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "8d008f7a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Derived fairness metrics. Be sure you understand the scale and meaning of these. Here we are calculating the \n",
|
||||||
|
"# same metrics as above, only using a filtered search to pull in all the possibilities of features\n",
|
||||||
|
"# starting with \"race\". What do the results show, is the model more or less fair with this grouping?\n",
|
||||||
|
"\n",
|
||||||
|
"dpr_race = fairlearn.metrics.demographic_parity_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
|
||||||
|
"print(\"Demographic Parity ratio:\\t\", dpr_race)\n",
|
||||||
|
"\n",
|
||||||
|
"eodds_race = fairlearn.metrics.equalized_odds_ratio(y_test, y_pred, sensitive_features=X_test.filter(regex=\"race.*\"))\n",
|
||||||
|
"print(\"Equalized Odds ratio:\\t\\t\", eodds_race)"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue