import matplotlib.pyplot as plt
%cd /content
!git clone https://github.com/kskelly03/DataProject.git
%cd /content/DataProject
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import seaborn as sns
pd.set_option('display.max_columns', 500)

/content
fatal: destination path 'DataProject' already exists and is not an empty directory.
/content/DataProject

pantheon = pd.read_csv('pantheon.tsv', sep='\t')
pantheon.head()

pantheon.columns

Index(['en_curid', 'name', 'numlangs', 'birthcity', 'birthstate',
       'countryName', 'countryCode', 'countryCode3', 'LAT', 'LON',
       'continentName', 'birthyear', 'gender', 'occupation', 'industry',
       'domain', 'TotalPageViews', 'L_star', 'StdDevPageViews',
       'PageViewsEnglish', 'PageViewsNonEnglish', 'AverageViews', 'HPI'],
      dtype='object')

pantheon = pantheon.drop(columns = ["countryCode3", 'birthstate', 'PageViewsEnglish',
                                      'PageViewsNonEnglish', "numlangs"])

pantheon = pantheon.rename(columns = {'en_curid': 'wiki_id',
                                      'occupation': 'occupation',
                                      'birthyear': 'birth',
                                      'gender': 'gender',
                                      'birthcity': 'city',
                                      'countryName': 'country',
                                      'countryCode': 'country_code',
                                      'continentName': 'continent',
                                      'LAT': 'latitude',
                                      'LON': 'longitude',
                                      'industry': 'industry',
                                      'domain': 'domain',
                                      'TotalPageViews': 'total_page_views',
                                      'StdDevPageViews': 'stdDevPageViews',
                                      'AverageViews': 'average_views',
                                      })

pantheon = pantheon.iloc[:,[0, 1, 10, 9, 8, 2, 3, 4, 7, 17, 5, 6, 11, 12, 13, 14, 15, 16]]
def get_lower(entry):
    return entry.lower()
pantheon["occupation"] = pantheon["occupation"].apply(get_lower)
pantheon["industry"] = pantheon["industry"].apply(get_lower)
pantheon["domain"] = pantheon["domain"].apply(get_lower)

pantheon['birth'].replace('', np.nan, inplace=True)

# Remove non-numeric characters and convert to integers
pantheon['birth'] = pantheon['birth'].str.replace(r'\D', '', regex=True)
pantheon['birth'] = pantheon['birth'].replace('', np.nan).astype('Int64')

pantheon.dtypes #dtypes look good

wiki_id               int64
name                 object
occupation           object
gender               object
birth                 Int64
city                 object
country              object
country_code         object
continent            object
HPI                 float64
latitude            float64
longitude           float64
industry             object
domain               object
total_page_views      int64
L_star              float64
stdDevPageViews     float64
average_views       float64
dtype: object

pantheon.head() #much better

url = 'https://en.wikipedia.org/wiki/List_of_biographical_films'

# Send a request to the website
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
tables = soup.find_all('table', {'class': 'wikitable'})

# empty list to store dfs
dfs = []

for table in tables:
    df = pd.read_html(str(table))[0]
    dfs.append(df)

wiki_df = pd.concat(dfs, ignore_index=True)

# Drop the 'portrayed by' column
wiki_df = wiki_df.drop(columns=['Portrayed by', 'Lead actor or actress'])
# Also there are a lot of rows with NaN values so we can just drop them since we already have so many movies
wiki_df.dropna(inplace=True)
wiki_df

%cd /content
filepath = 'movies.csv'
imdb_movies = pd.read_csv(filepath, low_memory=False, error_bad_lines=False)
imdb_movies= imdb_movies.drop(columns = ['id', 'certificate', "stars_name", 'stars_id', 'description'])
imdb_movies.head()

/content

<ipython-input-92-e13c9e8a1240>:3: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  imdb_movies = pd.read_csv(filepath, low_memory=False, error_bad_lines=False)

imdb_movies['release_year'] = imdb_movies['year'].str.extract(r'(\d{4})', expand=False)
imdb_movies.drop(columns = ['year'], inplace = True)
imdb_movies.rename(columns = {'name':'title'}, inplace= True)
imdb_movies.head()

wiki_df.rename(columns={'Year': 'release_year', 'Film': 'title'}, inplace=True)
imdb_movies = imdb_movies.dropna(subset=['release_year'])
imdb_movies['release_year'] = imdb_movies['release_year'].astype(int)
biopics = pd.merge(imdb_movies, wiki_df, on=['release_year', 'title'], how='inner')
biopics

def convert_income_to_numeric(income):
    if ',' in income:
        income = income.replace(',', '')  # Remove commas if present

    if income.endswith('M'):
        return float(income.replace('$', '').replace('M', '')) * 1e6
    elif income.endswith('K'):
        return float(income.replace('$', '').replace('K', '')) * 1e3
    else:
        return float(income.replace('$', ''))

# Apply the function to convert gross_income to numeric
biopics['gross_income'] = biopics['gross_income'].apply(convert_income_to_numeric)


# Convert votes to integer
biopics['votes'] = biopics['votes'].apply(convert_income_to_numeric)

biopics = biopics.rename(columns ={"Subject(s)": "subject"})
biopics = biopics[['title', 'rating', 'release_year', 'subject', 'votes', 'gross_income', 'duration', 'genre', 'directors_name', 'directors_id']]
display(biopics.head())
print(biopics.dtypes) #Awesome
len(biopics)

title              object
rating            float64
release_year        int64
subject            object
votes             float64
gross_income      float64
duration           object
genre              object
directors_name     object
directors_id       object
dtype: object

2245

players = pantheon[pantheon['occupation'] == 'basketball player']
players.set_index('HPI', inplace = True)
players.sort_index(ascending = False, inplace = True)
players.head()

<ipython-input-97-3dd6a2f284ea>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players.sort_index(ascending = False, inplace = True)

players = pantheon[pantheon['occupation'] == 'basketball player']
players['HPI'].mean()

17.604417681971828

mathies = pantheon[pantheon['occupation'] == 'mathematician']
mathies['HPI'].mean()

24.06658977630573

plt.figure(figsize=(8, 6))
plt.hist(pantheon['HPI'], bins=20, color='skyblue', edgecolor='black')
plt.xlabel('Historical Figure Popularity Index (HPI)')
plt.ylabel('Frequency')
plt.title('Histogram of Historical Figure Popularity Index')
plt.grid(axis='y', alpha=0.5)
plt.show()

biopics_release_years = biopics['release_year'].value_counts().sort_index()
movies_release_years = imdb_movies['release_year'].value_counts().sort_index()

# Filter data up to the year 2020
biopics_release_years_filtered = biopics_release_years[biopics_release_years.index <= 2020]
movies_release_years_filtered = movies_release_years[movies_release_years.index <= 2020]

# Normalize the counts
biopics_normalized = (biopics_release_years_filtered / biopics_release_years_filtered.sum()) * 100
movies_normalized = (movies_release_years_filtered / movies_release_years_filtered.sum()) * 100

plt.figure(figsize=(10, 6))
plt.plot(biopics_normalized.index, biopics_normalized.values, marker='o', linestyle='-', label='Biopics')
plt.plot(movies_normalized.index, movies_normalized.values, marker='o', linestyle='-', label='Movies')
plt.xlabel('Release Year')
plt.ylabel('Percentage of Films (%)')
plt.title('Normalized Biopics and Movies Count Over Release Years')
plt.legend()
plt.grid(True)
plt.xlim(left=min(biopics_normalized.index), right=2020)  # Limit x-axis to 2020
plt.show()

# Filter out ratings of 0.0
filtered_biopics = biopics[biopics['rating'] != 0.0]

plt.figure(figsize=(8, 6))
sns.boxplot(y='rating', data=filtered_biopics, orient='v', color='lightblue', width=0.5)
plt.ylabel('Rating')
plt.title('Ratings Distribution')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

lowest = filtered_biopics['rating'].min()
lowest_rated_movie = filtered_biopics[filtered_biopics['rating'] == lowest]
lowest_rated_movie

merged_df = pd.merge(pantheon, biopics, left_on='name', right_on='subject', how='inner')
merged_df.head()

# P-scoring HPI
merged_df['p_score_HPI'] = merged_df['HPI'].rank(pct=True)

# Scatter plot of p_scored HPI vs rating
plt.figure(figsize=(8, 6))
plt.scatter(merged_df[merged_df['rating'] <= 10]['p_score_HPI'], merged_df[merged_df['rating'] <= 10]['rating'], color='blue', alpha=0.5)

top_outliers = merged_df[(merged_df['rating'] > 8.5) & (merged_df['rating'] <= 10)].head(2)  # Select top 2 outliers
bottom_outliers = merged_df[(merged_df['rating'] < 3) & (merged_df['rating'] <= 10)].tail(2)  # Select bottom 2 outliers
selected_outliers = pd.concat([top_outliers, bottom_outliers])

# Labeling outliers on the plot
for i in range(len(selected_outliers)):
    plt.text(selected_outliers.iloc[i]['p_score_HPI'], selected_outliers.iloc[i]['rating'], f"{selected_outliers.iloc[i]['name']} - {selected_outliers.iloc[i]['title']}", fontsize=8)

plt.xlabel('p_score_HPI')
plt.ylabel('Rating')
plt.title('HPI vs Rating')
plt.grid(True)
plt.show()

merged_df = pd.merge(pantheon, biopics, left_on='name', right_on='subject', how='inner')
filtered_df = merged_df[(merged_df['gross_income'] != 0)].copy()
avg_ROI_per_figure = filtered_df.groupby('name')['gross_income'].mean().sort_values(ascending=False)

top_10_ROI = avg_ROI_per_figure.head(10)
plt.barh(top_10_ROI.index, top_10_ROI.values, color='skyblue')
plt.xlabel('Average Gross Income')
plt.title('Whose Biopics Make the Most Money')
plt.gca().invert_yaxis()
plt.show()

merged_df = pd.get_dummies(merged_df, columns=['occupation'], prefix = 'occupation')
occupation_columns = [col for col in merged_df.columns if col.startswith('occupation_')]

def calculate_percentile(group):
    group['HPI_percentile'] = group['HPI'].rank(pct=True)
    return group

merged_df['HPI_percentile'] = merged_df.groupby(occupation_columns)['HPI'].transform(lambda x: x.rank(pct=True))
display(merged_df)
merged_df = pd.get_dummies(merged_df, columns=['industry'], prefix = 'industry')

merged_df.rename(columns ={'HPI_percentile': 'HPI_percentile_occupation'}, inplace = True)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

occupation_columns = [col for col in merged_df.columns if col.startswith('occupation_')]
industry_columns = [col for col in merged_df.columns if col.startswith('industry_')]

# Original Model without directors_id
X = merged_df[['HPI_percentile_occupation', 'stdDevPageViews', 'release_year']+ occupation_columns + industry_columns]
y = (merged_df['rating'] > merged_df['rating'].mean()).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

KNeighborsRegressor()

KNeighborsRegressor()

from sklearn.metrics import accuracy_score, classification_report

predictions = knn.predict(X_test_scaled)
binary_predictions = (predictions > 0.5).astype(int)
accuracy = accuracy_score(y_test, binary_predictions)
report = classification_report(y_test, binary_predictions)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.6277372262773723
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.65      0.67        80
           1       0.55      0.60      0.57        57

    accuracy                           0.63       137
   macro avg       0.62      0.62      0.62       137
weighted avg       0.63      0.63      0.63       137

count_0 = sum(binary_predictions == 0)
count_1 = sum(binary_predictions == 1)
counts = [count_0, count_1]
labels = ['Below Mean', 'Above Mean']

# Plotting the pie chart
plt.figure(figsize=(6, 6))
plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140, colors=['skyblue', 'lightgreen'])
plt.title('Proportion of Movies Predicted')
plt.axis('equal')
plt.show()

from sklearn.metrics import confusion_matrix
import seaborn as sns
conf_matrix = confusion_matrix(y_test, binary_predictions)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Trying it with directors_id
label_encoder = LabelEncoder()
occupation_columns = [col for col in merged_df.columns if col.startswith('occupation_')]
industry_columns = [col for col in merged_df.columns if col.startswith('industry_')]
merged_df['directors_id_encoded'] = label_encoder.fit_transform(merged_df['directors_id'])
X = merged_df[['HPI_percentile_occupation', 'stdDevPageViews', 'release_year'] + occupation_columns + industry_columns + ['directors_id_encoded']]
y = (merged_df['rating'] > merged_df['rating'].mean()).astype(int)
merged_df = merged_df[merged_df['directors_id'] != 'Anonymous']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
predictions = knn.predict(X_test_scaled)
binary_predictions = (predictions > 0.5).astype(int)
accuracy = accuracy_score(y_test, binary_predictions)
report = classification_report(y_test, binary_predictions)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.656934306569343
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.74      0.72        80
           1       0.60      0.54      0.57        57

    accuracy                           0.66       137
   macro avg       0.65      0.64      0.64       137
weighted avg       0.65      0.66      0.65       137

	en_curid	name	numlangs	birthcity	birthstate	countryName	countryCode	countryCode3	LAT	LON	continentName	birthyear	gender	occupation	industry	domain	TotalPageViews	L_star	StdDevPageViews	PageViewsEnglish	PageViewsNonEnglish	AverageViews	HPI
0	307	Abraham Lincoln	131	Hodgenville	KY	UNITED STATES	US	USA	37.571111	-85.738611	North America	1809	Male	POLITICIAN	GOVERNMENT	INSTITUTIONS	66145211	5.801387	586914.72200	41477236	24667975	504925.2748	27.938585
1	308	Aristotle	152	Stageira	NaN	Greece	GR	GRC	40.333333	23.500000	Europe	-384	Male	PHILOSOPHER	PHILOSOPHY	HUMANITIES	56355172	11.914597	201067.46070	15745351	40609821	370757.7105	31.993795
2	339	Ayn Rand	55	Saint Petersburg	NaN	Russia	RU	RUS	59.950000	30.300000	Europe	1905	Female	WRITER	LANGUAGE	HUMANITIES	14208218	3.175685	87632.49020	11023490	3184728	258331.2364	24.325936
3	595	Andre Agassi	69	Las Vegas	NV	UNITED STATES	US	USA	36.121514	-115.173851	North America	1970	Male	TENNIS PLAYER	INDIVIDUAL SPORTS	SPORTS	11244030	6.242525	85553.31810	6353888	4890142	162956.9565	20.925999
4	628	Aldous Huxley	62	Godalming	NaN	UNITED KINGDOM	GB	GBR	51.185000	-0.610000	Europe	1894	Male	WRITER	LANGUAGE	HUMANITIES	9268920	6.219842	33037.03209	5137256	4131664	149498.7097	25.996605

	wiki_id	name	occupation	gender	birth	city	country	country_code	continent	HPI	latitude	longitude	industry	domain	total_page_views	L_star	stdDevPageViews	average_views
0	307	Abraham Lincoln	politician	Male	1809	Hodgenville	UNITED STATES	US	North America	27.938585	37.571111	-85.738611	government	institutions	66145211	5.801387	586914.72200	504925.2748
1	308	Aristotle	philosopher	Male	384	Stageira	Greece	GR	Europe	31.993795	40.333333	23.500000	philosophy	humanities	56355172	11.914597	201067.46070	370757.7105
2	339	Ayn Rand	writer	Female	1905	Saint Petersburg	Russia	RU	Europe	24.325936	59.950000	30.300000	language	humanities	14208218	3.175685	87632.49020	258331.2364
3	595	Andre Agassi	tennis player	Male	1970	Las Vegas	UNITED STATES	US	North America	20.925999	36.121514	-115.173851	individual sports	sports	11244030	6.242525	85553.31810	162956.9565
4	628	Aldous Huxley	writer	Male	1894	Godalming	UNITED KINGDOM	GB	Europe	25.996605	51.185000	-0.610000	language	humanities	9268920	6.219842	33037.03209	149498.7097

	Year	Film	Subject(s)
0	1906	The Story of the Kelly Gang	Ned Kelly
1	1909	Origin of Beethoven's Moonlight Sonata	Ludwig van Beethoven
2	1909	The Life of Moses	Moses
3	1909	Saul and David	King David
4	1909	Saul and David	King Saul
...	...	...	...
3288	2024	Untitled Snoop Dogg biopic film	Snoop Dogg
3289	2024	A Complete Unknown	Bob Dylan
3290	2024	Race for Glory: Audi vs. Lancia	Roland Gumpert
3291	2024	Race for Glory: Audi vs. Lancia	Cesare Fiorio
3292	2024	Race for Glory: Audi vs. Lancia	Walter Röhrl

	name	year	rating	duration	genre	votes	directors_id	directors_name
0	Best in Sex: 2015 AVN Awards	(2015 TV Special)	4.0	94 min	Adult, News	124.0	nm1624094	Gary Miller
1	Naughty Novelist	(2008 Video)	3.8	88 min	Adult	174.0	nm0045256	John Bacchus
2	2011 AVN Awards Show	(2011 TV Special)	5.7	83 min	Adult, News	39.0	nm1624094,nm0754845	Gary Miller,Timothy E. Sabo
3	Best in Sex: 2017 AVN Awards	(2017 TV Special)	4.9	87 min	Adult, News	225.0	nm1624094	Gary Miller
4	AVN Awards 2014	(2014 TV Special)	6.7	82 min	Adult, News	101.0	nm1624094	Gary Miller

	title	rating	duration	genre	votes	directors_id	directors_name	release_year
0	Best in Sex: 2015 AVN Awards	4.0	94 min	Adult, News	124.0	nm1624094	Gary Miller	2015
1	Naughty Novelist	3.8	88 min	Adult	174.0	nm0045256	John Bacchus	2008
2	2011 AVN Awards Show	5.7	83 min	Adult, News	39.0	nm1624094,nm0754845	Gary Miller,Timothy E. Sabo	2011
3	Best in Sex: 2017 AVN Awards	4.9	87 min	Adult, News	225.0	nm1624094	Gary Miller	2017
4	AVN Awards 2014	6.7	82 min	Adult, News	101.0	nm1624094	Gary Miller	2014

Project Goals¶

Dataset Descriptions¶

Cleaning up the Pantheon¶

Creating and Tidying our Biopics¶

Let's Look Into Our Data¶

Historical Popularity (HPI) Histogram¶

Biopic vs. Release Year¶

Distribution of Rating for Biopics¶

Creating a Scatterplot for HPI and rating¶

Visualizing the Average Gross Income for Pantheon-Figures in Movies¶

Modeling¶

Works Cited¶

	title	rating	duration	genre	votes	gross_income	directors_id	directors_name	release_year	Subject(s)
0	The Staircase	7.2	519 min	Biography, Crime, Drama	14,542	0	Anonymous	nm0000000	2022	Michael Peterson
1	The Staircase	7.2	519 min	Biography, Crime, Drama	14,542	0	Anonymous	nm0000000	2022	Kathleen Peterson
2	The Staircase	11.0	0 min	Short, Mystery, Thriller	0	0	nm13631287	SreeramSivanand	2022	Michael Peterson
3	The Staircase	11.0	0 min	Short, Mystery, Thriller	0	0	nm13631287	SreeramSivanand	2022	Kathleen Peterson
4	House of Gucci	6.6	158 min	Crime, Drama	108,469	57,300,000	nm0000631	Ridley Scott	2021	Patrizia Reggiani
...	...	...	...	...	...	...	...	...	...	...
2240	Seine einzige Liebe	11.0	0 min	Biography, Drama	0	0	nm0361518	Emmerich Hanus	1947	Franz Schubert
2241	Docteur Laennec	6.4	95 min	Biography, Drama	23	0	nm0166996	Maurice Cloche	1949	René Laennec
2242	Rasputín	11.0	0 min	Biography	0	0	nm2466657	Ernesto Mas	1958	Grigori Rasputin
2243	Why America Will Win	11.0	70 min	Biography, Drama	0	0	nm0822801	Richard Stanton	1918	John J. Pershing
2244	My Mother	11.0	26 min	Short, Biography	0	0	Anonymous	nm0000000	1917	Abraham Lincoln

	wiki_id	name	occupation	gender	birth	city	country	country_code	continent	latitude	longitude	industry	domain	total_page_views	L_star	stdDevPageViews	average_views
HPI
23.841241	16899	Kareem Abdul-Jabbar	basketball player	Male	1947	New York	UNITED STATES	US	North America	40.712700	-74.005900	team sports	sports	11460425	4.429107	62602.57056	260464.2045
23.815956	255645	Wilt Chamberlain	basketball player	Male	1936	Philadelphia	UNITED STATES	US	North America	39.950000	-75.166667	team sports	sports	10629180	3.555347	69520.05013	241572.2727
23.619816	20455	Michael Jordan	basketball player	Male	1963	New York	UNITED STATES	US	North America	40.692778	-73.990278	team sports	sports	49879093	5.077657	262167.12810	665054.5733
22.403537	36753	Magic Johnson	basketball player	Male	1959	Lansing	UNITED STATES	US	North America	42.733611	-84.546667	team sports	sports	16987222	4.048427	90796.03641	320513.6226
21.838553	412214	Bill Russell	basketball player	Male	1934	West Monroe	UNITED STATES	US	North America	32.510833	-92.140000	team sports	sports	4944312	2.840501	48678.49348	159493.9355