From 675949cbe156ede65e9fc1f2dabb2b3caa6d5c1d Mon Sep 17 00:00:00 2001 From: Brian Wright Date: Mon, 24 Nov 2025 19:06:43 +0000 Subject: [PATCH] updated final almost done --- Final_Project_Notebook.ipynb | 408 ++++++++++++++++++++++++++++++----- requirements.txt | 5 +- 2 files changed, 356 insertions(+), 57 deletions(-) diff --git a/Final_Project_Notebook.ipynb b/Final_Project_Notebook.ipynb index df24da4..72b40f8 100644 --- a/Final_Project_Notebook.ipynb +++ b/Final_Project_Notebook.ipynb @@ -7,7 +7,42 @@ "source": [ "# Final Project Notebook\n", "\n", - "Use the follow cells prompts to complete the final project for the course. Everything you need should be present in this notebook or previous notebooks we've used in class. You can work together as needed. " + "Use the following cell prompts to complete the final project for the course. Everything you need should be present in this notebook or previous notebooks we've used in class. You can work together as needed. \n", + "\n", + " - You will need to name your own dataset and use that name throughout\n", + " - There are sections where you need to make changes to the code and insert new code; this will be noted in the code provided\n", + " - You may get frustrated along the way; this is totally normal. Just remember that even small changes to the code make a huge difference. " + ] + }, + { + "cell_type": "markdown", + "id": "d45a3b8d", + "metadata": {}, + "source": [ + "## Question Fork the Repository\n", + "i. Include a screenshot of the forked repo in your GitHub account\n", + "\n", + "To fork the repository:\n", + "1. Go to https://github.com/NovaVolunteer/ds1001_final\n", + "2. Click the \"Fork\" button in the top right corner\n", + "3. The repo will be forked to your GitHub account\n", + "4. 
Take a screenshot of your forked repository" + ] + }, + { + "cell_type": "markdown", + "id": "2f556f44", + "metadata": {}, + "source": [ + "### You should now be able to open the new repo in Google Colab " + ] + }, + { + "cell_type": "markdown", + "id": "49e14c0f", + "metadata": {}, + "source": [ + "## Systems" ] }, { @@ -17,7 +52,9 @@ "outputs": [], "source": [ "#Run the requirements file to load the required packages\n", - "!pip install -r requirements.txt" + "!pip install -r requirements.txt\n", + "#Are there additional packages to install? (Cross check with the list below to \n", + "# ensure all packages are installed)" ] }, { @@ -31,56 +68,14 @@ "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", - "from sklearn.preprocessing import MinMaxScaler" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "95149c9b", - "metadata": { - "vscode": { - "languageId": "markdown" - } - }, - "outputs": [], - "source": [ - "## Systems" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5707e81a", - "metadata": { - "vscode": { - "languageId": "markdown" - } - }, - "outputs": [], - "source": [ - "### Question Fork the Repository\n", - "i. Include a screenshot of the forked repo in your GitHub account\n", - "\n", - "To fork the repository:\n", - "1. Go to https://github.com/NovaVolunteer/ds1001_final\n", - "2. Click the \"Fork\" button in the top right corner\n", - "3. The repo will be forked to your GitHub account\n", - "4. 
Take a screenshot of your forked repository\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3cde0ef0", - "metadata": { - "vscode": { - "languageId": "markdown" - } - }, - "outputs": [], - "source": [ - "### You should now be able to open the new repo in Google Collab " + "from sklearn.preprocessing import MinMaxScaler\n", + "import mice\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", + "import fairlearn.metrics\n", + "from fairlearn.metrics import MetricFrame\n", + "from fairlearn.metrics import count, true_positive_rate, false_positive_rate, selection_rate, demographic_parity_ratio\n" ] }, { @@ -121,6 +116,14 @@ "#This will show you all installed packages that have newer versions available, displaying both the current version and the latest version." ] }, + { + "cell_type": "markdown", + "id": "7369da7c", + "metadata": {}, + "source": [ + "## Design: Data prep and exploration " + ] + }, { "cell_type": "code", "execution_count": null, @@ -128,9 +131,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('your_dataset.csv')\n", + "\"name your dataset\" = pd.read_csv('your_dataset.csv')\n", "\n", - "#a. How many rows are in the dataframe? How many columns?" + "# How many rows are in the dataframe? How many columns?" ] }, { @@ -140,8 +143,301 @@ "metadata": {}, "outputs": [], "source": [ - "#b. 
How many numeric columns are in the data set?\n", - "num_numeric_columns = df.select_dtypes(include=['number']).shape[1]" + "# How many numeric columns are in the data set?\n", + "num_numeric_columns = \"xx\".select_dtypes(include=['number']).shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e76a5f9e", + "metadata": {}, + "outputs": [], + "source": [ + "# Normalization\n", + "scaler = MinMaxScaler()\n", + "\"xx\"[\"xx\".select_dtypes(include=['number']).columns] = scaler.fit_transform(\"xx\".select_dtypes(include=['number']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9875873", + "metadata": {}, + "outputs": [], + "source": [ + "# Likely need to convert categorical columns to category dtype\n", + "for col in \"xx\".select_dtypes(include=['object']).columns:\n", + " \"xx\"[col] = \"xx\"[col].astype('category') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80626fad", + "metadata": {}, + "outputs": [], + "source": [ + "# Creating dummy variables, make sure the variables that need to be converted to dummies are categorical, not numeric.\n", + "# This might require you to convert some columns to categorical first using astype('category')\n", + "\"xx\" = pd.get_dummies(\"xx\", drop_first=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f39eb5f0", + "metadata": {}, + "outputs": [], + "source": [ + "# Display missing data using mice for the dataset, which columns have missing data?\n", + "mice.plot_missing(\"xx\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26ae0ed6", + "metadata": {}, + "outputs": [], + "source": [ + "# remove missing values if needed\n", + "\"xx\" = \"xx\".dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acd5b69f", + "metadata": {}, + "outputs": [], + "source": [ + "# Scatterplot between two variables\n", + "plt.figure(figsize=(10, 6))\n", + 
"sns.scatterplot(x='Variable1', y='Variable2', data=\"xx\") # Replace 'Variable1' and 'Variable2' with your column names\n", + "plt.title('Scatterplot of Variable1 vs Variable2')\n", + "plt.savefig('scatterplot.png') # Save the scatterplot image\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4889270f", + "metadata": {}, + "outputs": [], + "source": [ + "# Density chart of a continuous variable\n", + "plt.figure(figsize=(10, 6))\n", + "sns.kdeplot(\"xx\"['ContinuousVariable'], fill=True) # Replace 'ContinuousVariable' with your column name\n", + "plt.title('Density Chart of ContinuousVariable')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9131b326", + "metadata": {}, + "outputs": [], + "source": [ + "#Correlation matrix, make sure to only include numeric variables\n", + "num_values = \"xx\".select_dtypes(include=['number'])\n", + "correlation_matrix = num_values.corr()\n", + "plt.figure(figsize=(12, 8))\n", + "sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\n", + "plt.title('Correlation Matrix')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "7072676b", + "metadata": {}, + "source": [ + "## Analytics: Build a model and Tune it for best Best Performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5d16e93", + "metadata": {}, + "outputs": [], + "source": [ + "# What is the ‘target’ of a model and what is the prevalence of the target in your dataset? 
Remember prevalence \n", + "# is the proportion of records that take on the value of interest for the target variable, usually the positive class.\n", + "target_prevalence = \"xx\"['TargetVariable'].mean() # Mean of a 0/1 (or True/False) target is the proportion of positives; replace 'TargetVariable' with your target column name\n", + "print(f'Target Prevalence: {target_prevalence}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6f008fe", + "metadata": { + "vscode": { + "languageId": "markdown" + } + }, + "outputs": [], + "source": [ + "# Divide the dataset into features and target\n", + "target = \"xx\"['TargetVariable'] # Replace 'TargetVariable' with your actual target column name\n", + "features = \"xx\".drop(columns=['TargetVariable']) # Drop by column name (not the target Series); use the same name as above\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e511f88", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c995e33", + "metadata": {}, + "outputs": [], + "source": [ + "# Include your table for the 10 values of k you tried and the corresponding accuracies.\n", + "\n", + "accuracy_results = {}\n", + "\n", + "for k in range(x, x): # Replace x with your desired range values\n", + " knn_model = KNeighborsClassifier(n_neighbors=k)\n", + " knn_model.fit(X_train, y_train)\n", + " accuracy_results[k] = knn_model.score(X_test, y_test) # Record test accuracy for this k so the table and plot have data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e866ae7d", + "metadata": {}, + "outputs": [], + "source": [ + "# using the hyperparameter k that gave the best accuracy, rerun the model and generate \n", + "# predictions on the test set.\n", + "best_k = 'xx' # Replace 'xx' with the best k value found\n", + "knn_model = KNeighborsClassifier(n_neighbors=best_k)\n", + "knn_model.fit(X_train, y_train)\n", + "y_pred = knn_model.predict(X_test) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "366f9e5a", + "metadata": {}, + 
"outputs": [], + "source": [ + "#graph of accuracy vs k values\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(list(accuracy_results.keys()), list(accuracy_results.values()), marker='o')\n", + "plt.title('KNN Accuracy vs K Values')\n", + "plt.xlabel('Number of Neighbors (k)')\n", + "plt.ylabel('Accuracy')\n", + "plt.xticks(list(accuracy_results.keys()))\n", + "plt.grid()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "3fa4f911", + "metadata": {}, + "source": [ + "## Value: Evaluation and Protected Classes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aebf9d93", + "metadata": {}, + "outputs": [], + "source": [ + "# Explore the variables a bit more, create histograms for the numerics values and bar charts for the categoricals.\n", + "# Histograms for numeric variables\n", + "numeric_columns = \"xx\".select_dtypes(include=['number']).columns\n", + "for col in numeric_columns: \n", + " plt.figure(figsize=(10, 6))\n", + " sns.histplot(\"xx\"[col], kde=True)\n", + " plt.title(f'Histogram of {col}')\n", + " plt.show() \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1d7469e", + "metadata": {}, + "outputs": [], + "source": [ + "# Bar charts for categorical variables\n", + "categorical_columns = \"xx\".select_dtypes(include=['object', 'category']).columns\n", + "for col in categorical_columns:\n", + " plt.figure(figsize=(10, 6))\n", + " sns.countplot(x=\"xx\"[col])\n", + " plt.title(f'Bar Chart of {col}')\n", + " plt.xticks(rotation=45)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b20d118", + "metadata": {}, + "outputs": [], + "source": [ + "# create a confusion matrix for your model's predictions.\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm)\n", + "disp.plot()\n", + "plt.title('Confusion Matrix')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"e12b6ebe", + "metadata": {}, + "outputs": [], + "source": [ + "#We already have a model above using KNN so we can use the results to compute fairness metrics\n", + "\n", + "# Compute fairness metrics using Fairlearn\n", + "\n", + "my_metrics = {\n", + " 'true positive rate' : true_positive_rate,\n", + " 'false positive rate' : false_positive_rate,\n", + " 'selection rate' : selection_rate,\n", + " 'count' : count\n", + "}\n", + "# Construct a MetricFrame for race\n", + "mf_race = MetricFrame(\n", + " metrics=my_metrics,\n", + " y_true=y_test,\n", + " y_pred=y_pred,\n", + " sensitive_features=X_test[\"xx1\"] # Replace with your first protected class\n", + ")\n", + "\n", + "# Construct a MetricFrame for sex\n", + "mf_sex = MetricFrame(\n", + " metrics=my_metrics,\n", + " y_true=y_test,\n", + " y_pred=y_pred,\n", + " sensitive_features=X_test[\"xx2\"] # Replace second protected class \n", + ") \n" ] } ], diff --git a/requirements.txt b/requirements.txt index 8d4a6b2..d0b07b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ debugpy==1.8.17 decorator==5.2.1 defusedxml==0.7.1 executing==2.2.1 +fairlearn==0.13.0 fastjsonschema==2.21.2 fonttools==4.60.1 fqdn==1.5.1 @@ -54,7 +55,9 @@ lark==1.3.1 MarkupSafe==3.0.3 matplotlib==3.10.7 matplotlib-inline==0.2.1 +mice==0.1.31 mistune==3.1.4 +narwhals==2.12.0 nbclient==0.10.2 nbconvert==7.16.6 nbformat==5.10.4 @@ -89,7 +92,7 @@ rfc3986-validator==0.1.1 rfc3987-syntax==1.1.0 rpds-py==0.29.0 scikit-learn==1.7.2 -scipy==1.16.3 +scipy==1.15.3 seaborn==0.13.2 Send2Trash==1.8.3 setuptools==80.9.0