"""second_iteration.py: plot the most common ingredients for each cuisine in cuisines.csv."""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def main() -> None:
    """
    Load the cuisines dataset and plot the top ingredients for every cuisine.

    Reads ``./cuisines.csv`` (first column used as the index), collects the
    distinct cuisine labels and, for each one, computes the sorted ingredient
    counts and shows them as a bar chart.
    """
    # Load the dataset
    cuisines_df: pd.DataFrame = pd.read_csv("./cuisines.csv", index_col=0)

    # Normalise the labels before de-duplicating. The per-cuisine filter is
    # case-insensitive and the plot title is re-capitalised, so "Thai" and
    # "thai" would otherwise produce two identical charts. Missing labels
    # are dropped because a NaN label has no ``.lower()`` and would crash the
    # filtering step.
    target_cuisines: list[str] = (
        cuisines_df["cuisine"].str.lower().dropna().unique().tolist()
    )

    # For each cuisine, process and plot ingredient data
    for cuisine in target_cuisines:
        sorted_ingredients: pd.DataFrame = get_sorted_cuisine_ingredients(
            cuisines_df, cuisine
        )
        plot_cuisine_ingredients(sorted_ingredients, cuisine)
def get_sorted_cuisine_ingredients(
    df: pd.DataFrame,
    cuisine_name: str,
) -> pd.DataFrame:
    """
    Return the ingredient totals for one cuisine, most used first.

    Args:
        df: Recipe table with a ``cuisine`` label column; every other column
            is summed as an ingredient count.
        cuisine_name: Cuisine to select; matched case-insensitively.

    Returns:
        A DataFrame indexed by ingredient (the non-``cuisine`` column names)
        with a single ``value`` column holding the summed count. Ingredients
        whose total is zero are removed and the rows are sorted by ``value``
        in descending order. The frame is empty when no row matches.
    """
    # Select rows matching the given cuisine (case-insensitive)
    is_target = df["cuisine"].str.lower() == cuisine_name.lower()

    # Drop the label column first and sum down each ingredient column.
    # Transposing the mixed string/number frame instead would upcast every
    # value to object dtype and copy the whole table just to sum it.
    ingredient_columns = df.loc[is_target].drop(columns=["cuisine"])
    ingredient_totals: pd.Series = ingredient_columns.sum(axis=0)

    # Convert to a one-column DataFrame for plotting
    counts_df: pd.DataFrame = ingredient_totals.to_frame(name="value")

    # Keep only ingredients that are actually used
    present_ingredients: pd.DataFrame = counts_df[counts_df["value"] != 0]

    # Sort by frequency, most common first
    return present_ingredients.sort_values(by="value", ascending=False)
def plot_cuisine_ingredients(
    df_to_plot: pd.DataFrame, cuisine_name: str, top_n: int = 10
) -> None:
    """
    Show a horizontal bar chart of the first ``top_n`` rows of ``df_to_plot``.

    Args:
        df_to_plot: Ingredient counts to draw. Only the first ``top_n`` rows
            are used, so the frame is assumed to be sorted most-common first
            already (verify against the caller).
        cuisine_name: Cuisine label, capitalised for the chart title.
        top_n: Number of leading rows to draw; also quoted in the title.
    """
    chart_title: str = (
        f"Top {top_n} Most Common {cuisine_name.capitalize()} Ingredients"
    )

    # Draw the bars and keep the Axes pandas returns, so the labels are set on
    # that object rather than through pyplot's implicit current axes.
    leading_rows = df_to_plot.head(top_n)
    axes = leading_rows.plot.barh(title=chart_title)
    axes.set_xlabel("Count")
    axes.set_ylabel("Ingredient")

    # barh draws the first row at the bottom; flip so it appears at the top.
    axes.invert_yaxis()

    plt.show()
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()