mirror of
https://github.com/harivansh-afk/DS1001-LABS-Projects.git
synced 2026-04-15 09:01:15 +00:00
356 lines
22 KiB
Text
356 lines
22 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "879c2ef2",
|
|
"metadata": {},
|
|
"source": [
|
|
"# LABS-6: Design Project\n",
|
|
"\n",
|
|
"In this notebook you will run and edit the code to perform some exploratory data analysis (EDA) and to develop and answer a question using data.\n",
|
|
"\n",
|
|
"**Data**\\\n",
|
|
"This dataset comes from IMDB and can be accessed on [Kaggle](https://www.kaggle.com/datasets/ashpalsingh1525/imdb-movies-dataset)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0334b281",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Set up environment"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "8ca62288",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## import packages\n",
|
|
"\n",
|
|
"import pandas as pd #data manipulation & analysis\n",
|
|
"import numpy as np #arrays & math\n",
|
|
"import matplotlib.pyplot as plt #data visualization\n",
|
|
"import seaborn as sns #statistical visualization\n",
|
|
"from scipy.stats import gaussian_kde #scientific computing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "a9110372",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"ename": "FileNotFoundError",
|
|
"evalue": "[Errno 2] No such file or directory: 'imdb_movies.csv'",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Read in data\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimdb_movies.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
|
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
|
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/util/_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[1;32m 326\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 327\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[1;32m 328\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[1;32m 329\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 330\u001b[0m )\n\u001b[0;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
|
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m 935\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 936\u001b[0m dialect,\n\u001b[1;32m 937\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 946\u001b[0m defaults\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelimiter\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m\"\u001b[39m},\n\u001b[1;32m 947\u001b[0m )\n\u001b[1;32m 948\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 950\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
|
|
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 602\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 604\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 605\u001b[0m parser \u001b[38;5;241m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[1;32m 607\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 608\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
|
|
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1439\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1441\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1442\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_engine(f, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine)\n",
|
|
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1733\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1734\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1735\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m get_handle(\n\u001b[1;32m 1736\u001b[0m f,\n\u001b[1;32m 1737\u001b[0m mode,\n\u001b[1;32m 1738\u001b[0m encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1739\u001b[0m compression\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompression\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1740\u001b[0m memory_map\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmemory_map\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[1;32m 1741\u001b[0m is_text\u001b[38;5;241m=\u001b[39mis_text,\n\u001b[1;32m 1742\u001b[0m errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding_errors\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 1743\u001b[0m storage_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstorage_options\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1744\u001b[0m )\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
|
|
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 851\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 852\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 853\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 856\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 857\u001b[0m handle,\n\u001b[1;32m 858\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 859\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 860\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 861\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 862\u001b[0m )\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 864\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 865\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
|
|
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'imdb_movies.csv'"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Read in data\n",
|
|
"data = pd.read_csv(\"imdb_movies.csv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "fcbfbe2a",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Understand your data\n",
|
|
"\n",
|
|
"First we want to understand what the data looks like and how much of it there is. \\\n",
|
|
"To do that, we can start by looking at: \n",
|
|
"- the shape of the data\n",
|
|
"- viewing a portion of the data\n",
|
|
"- checking the data types for each column\n",
|
|
"- looking at the summary stats of our numeric columns\n",
|
|
"\n",
|
|
"The data is stored in a [pandas dataframe](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html). This is a tabular data structure that can store different types of data like numbers and text. Dataframes are objects in Python, and therefore have attributes and methods that we can use to help us understand the data. You will learn several of these below."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4e214f81",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Shape\n",
|
|
"\n",
|
|
"The [shape](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shape.html#pandas.DataFrame.shape) attribute tells us the shape of the data, or how many rows and columns are in our dataset. \n",
|
|
"\n",
|
|
"Since this is an attribute, we call it using this format: `df.shape` where `df` is the name of your dataframe.\n",
|
|
"\n",
|
|
"This attribute returns the shape in this format: (# rows, # columns)\n",
|
|
"\n",
|
|
"Run the cell below to get the shape of the dataframe and answer **question 1**.\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "36d5224c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3ae4e773",
|
|
"metadata": {},
|
|
"source": [
|
|
"### View a portion of the Data\n",
|
|
"\n",
|
|
"It is useful to be able to see what our data actually looks like - but we usually only need to see a few rows to understand what it looks like. Dataframes have a method called [`.head()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html#pandas.DataFrame.head) that displays the first *n* rows of the dataframe. \n",
|
|
"\n",
|
|
"Since this is a method, we call it using this format: `df.head()` where `df` is the name of your dataframe.\n",
|
|
"\n",
|
|
"You can choose how many rows you want to see by setting the *n* parameter inside the parentheses. Let's look at the first 10 rows of our dataframe. Run the cell below to see the data."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "37fba2e3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.head(n=10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "32365044",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Check data types for each column\n",
|
|
"\n",
|
|
"Now we will check the data types so we know how to handle each column during cleaning and analysis. The [`.info()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.info.html#pandas.DataFrame.info) method gives us a quick summary of the dataset. It tells us how many entries (rows) there are, the names of the columns, the data type stored in each column, and how many values are missing.\n",
|
|
"\n",
|
|
"Since this is a method, we call it like a function using this format: `df.info()` where `df` is the name of your DataFrame.\n",
|
|
"\n",
|
|
"This method prints the details directly below the cell instead of returning a value.\n",
|
|
"\n",
|
|
"\n",
|
|
"**In the cell below, write the code to call the .info() method on `data`. Use the previous examples and documentation linked above to help you.**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c7f132c8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ENTER YOUR CODE TO RUN .info() HERE\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "70d4b227",
|
|
"metadata": {},
|
|
"source": [
|
|
"Here are a few helpful tips to help you answer **question 5**:\n",
|
|
"- Python stores numbers either as integers (int) or floating point decimals (float). Python stores other data types (like strings and lists) as objects.\n",
|
|
"- \"non-null\" means the value is present, so the non-null count is the number of rows in that column without a missing value."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3391311a",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Look at summary stats for numeric columns\n",
|
|
"\n",
|
|
"Now we will summarize our columns to spot patterns and outliers during cleaning and analysis. The [`.describe()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html#pandas.DataFrame.describe) method gives us a quick set of summary statistics. By default, it reports the following values for numeric columns: \n",
|
|
"- count\n",
|
|
"- mean\n",
|
|
"- standard deviation\n",
|
|
"- minimum\n",
|
|
"- quartiles (25%, 50%, 75%)\n",
|
|
"- maximum\n",
|
|
"\n",
|
|
"Since this is a method, we call it using this format: `df.describe()` where `df` is the name of your DataFrame.\n",
|
|
"\n",
|
|
"This method returns a new DataFrame of summary statistics.\n",
|
|
"\n",
|
|
"**In the cell below, write the code to call the .describe() method on `data`. Use the previous examples and documentation linked above to help you.**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "77c2b43b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ENTER YOUR CODE TO RUN .describe() HERE\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ae52bc19",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Visualize the Data\n",
|
|
"\n",
|
|
"Now we want to visualize the data to see if we can pull out any initial trends off the bat. These will inform the questions we ask and our modeling process later on. We will use 3 common graphs to look at some baseline relationships and distributions of our data:\n",
|
|
"- Scatterplot\n",
|
|
"- Bar chart\n",
|
|
"- Density plot\n",
|
|
"\n",
|
|
"It is important to remember that the point of this process is to understand trends and distributions of your data. There are many other plots data scientists use to do this - these are just a few basic ones.\n",
|
|
"\n",
|
|
"To visualize the data we will use 2 packages: [`MatPlotLib`](https://matplotlib.org/) and [`Seaborn`](https://seaborn.pydata.org/). These were imported at the beginning of the notebook with aliases - shorthand names that we can use to reference the packages in our code. The alias for `MatPlotLib` is `plt` and the alias for `Seaborn` is `sns`. These are the standard aliases used for these packages. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4d8c80db",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Scatterplot\n",
|
|
"\n",
|
|
"[Scatterplots](https://matplotlib.org/stable/gallery/shapes_and_collections/scatter.html) show the relationship between 2 or more variables. You can add other features, like point shape and color, to your graph to see relationships of additional variables.\n",
|
|
"\n",
|
|
"Run the cell below to create a scatterplot of the relationship between movie budget and revenue."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "063767fb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"fig, ax = plt.subplots(figsize=(15, 10))\n",
|
|
"ax.scatter(data['budget_x'], data['revenue'])\n",
|
|
"\n",
|
|
"plt.ylabel(\"Revenue (in billions USD)\")\n",
|
|
"plt.xlabel(\"Budget (in one hundred millions USD)\")\n",
|
|
"plt.title(\"Budget and Revenue relationship\")\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "08493162",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Bar Chart\n",
|
|
"\n",
|
|
"A [Bar Chart](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.bar.html#matplotlib.pyplot.bar) shows comparisons between categories by using rectangular bars. The height of each bar represents the value or frequency of that category. You can customize bar color, orientation, and grouping to highlight patterns or differences across multiple variables.\n",
|
|
"\n",
|
|
"The cell below has the code to produce a bar chart using the default color settings. **Use the documentation linked above to change the color of the bars.**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "405a583b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"country_counts = data['country'].value_counts()\n",
|
|
"\n",
|
|
"fig, ax = plt.subplots(figsize=(15, 10))\n",
|
|
"ax.bar(country_counts.index, country_counts.values) \n",
|
|
"ax.tick_params(\"x\", rotation=45)\n",
|
|
"\n",
|
|
"plt.ylabel(\"count\")\n",
|
|
"plt.xlabel(\"country\")\n",
|
|
"plt.title(\"Count of movies per country\")\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f326d1e9",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Density plot\n",
|
|
"\n",
|
|
"A [Density Plot](https://seaborn.pydata.org/generated/seaborn.kdeplot.html) shows the distribution of a continuous variable by estimating its probability density function. This is similar to histograms, but density plots use smooth curves to represent data, making it easier to compare distributions and spot patterns. You can adjust the smoothness and overlay multiple curves to explore differences across groups.\n",
|
|
"\n",
|
|
"The cell below has the code to produce a Density plot, but is missing a title. **Add the code to produce a title before running the cell.**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "65c74839",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"x = data['score']\n",
|
|
"plt.figure(figsize=(15, 10))\n",
|
|
"sns.kdeplot(data=data, x=\"score\")\n",
|
|
"\n",
|
|
"## ADD CODE TO PRODUCE TITLE HERE\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "69e7fafc",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Develop and Answer a Question"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e5078e47",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"correlation_matrix = data[['score', 'budget_x', 'revenue']].corr()\n",
|
|
"\n",
|
|
"plt.figure(figsize=(15, 10))\n",
|
|
"sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=\".2f\")\n",
|
|
"plt.title('Correlation Matrix Heatmap')\n",
|
|
"plt.show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|