import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats


from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


df_1 = pd.read_csv('/content/drive/MyDrive/CMSC320 Final Project/dataset.csv')
df = df_1.drop_duplicates(subset=['track_id'])
df = df.rename(columns={"Unnamed: 0": "ID"})
df = df.drop(columns=['ID'])
display(len(df_1))
display(len(df))
display("Number of Duplicates: " + str(len(df_1) - len(df)))
display(df)

114000

89741

'Number of Duplicates: 24259'


df_mean = df.groupby('track_genre')['popularity'].mean().sort_values(ascending=False)
df2 = df_mean.reset_index()
df2.set_index('track_genre', inplace=True)
display(df2.head(30))


plt.figure(figsize=(10, 6))
df2.head(30)['popularity'].plot(kind='bar', color='green')
plt.title('Top 30 Genres by Mean Popularity')
plt.xlabel('Genre')
plt.ylabel('Mean Popularity')
plt.show()


display(df2.tail(10))


plt.figure(figsize=(10, 6))
df2.tail(10)['popularity'].plot(kind='bar', color='green')
plt.title('Bottom 10 Genres by Mean Popularity')
plt.xlabel('Genre')
plt.ylabel('Mean Popularity')
plt.show()


pop_data = df[df['track_genre'] == 'pop'][['danceability', 'popularity']].dropna()
hiphop_data = df[df['track_genre'] == 'hip-hop'][['danceability', 'popularity']].dropna()
print(pop_data)

pop_corr, pop_p = stats.pearsonr(pop_data['danceability'], pop_data['popularity'])

print(f'pop correlation: {pop_corr}, pop p value: {pop_p}')

       danceability  popularity
81000         0.514          91
81004         0.679          90
81006         0.724          74
81009         0.772          76
81012         0.410          90
...             ...         ...
81989         0.599          64
81990         0.668          64
81992         0.766          63
81993         0.741          64
81994         0.876          64

[416 rows x 2 columns]
pop correlation: 0.10307501506514821, pop p value: 0.03558825695481653


plt.scatter(pop_data['danceability'], pop_data['popularity'])
plt.title('Effect of Danceability on Popularity for Pop Music')
plt.xlabel('Danceability')
plt.ylabel('Popularity')

Text(0, 0.5, 'Popularity')


pop_data_clean = pop_data.dropna()
hiphop_data_clean = hiphop_data.dropna()

pop_data = df[df['track_genre'] == 'pop'][['liveness', 'popularity']].dropna()
hiphop_data = df[df['track_genre'] == 'hip-hop'][['liveness', 'popularity']].dropna()

print(stats.ttest_ind(pop_data, hiphop_data, equal_var=False))

plt.scatter(pop_data['liveness'], pop_data['popularity'], label="Pop")
plt.scatter(hiphop_data['liveness'], hiphop_data['popularity'], label="Hip-Hop")
plt.title('Effect of Liveness on Popularity for Pop and Hip-Hop Music')
plt.legend()
plt.xlabel('Liveness')
plt.ylabel('Popularity')

TtestResult(statistic=array([-3.9950668 , -0.23588592]), pvalue=array([6.89804197e-05, 8.13586897e-01]), df=array([1099.34611991,  735.13326543]))

Text(0, 0.5, 'Popularity')


pop = df[df['track_genre'] == 'pop']['tempo'].dropna()
rock = df[df['track_genre'] == 'rock']['tempo'].dropna()
jazz = df[df['track_genre'] == 'jazz']['tempo'].dropna()
allthree = df[(df['track_genre'] == 'pop') & (df['track_genre'] == 'rock') & (df['track_genre'] == 'jazz')]
allthree = df[df['track_genre'].isin(['pop', 'rock', 'jazz'])]

print(stats.f_oneway(pop, rock, jazz))

# plt.scatter(allthree['track_genre'], allthree['tempo'])
# plt.xlabel('Genres')
# plt.ylabel('Tempo')
plt.hist(df[df['track_genre'] == 'jazz']['tempo'], label="Jazz")
plt.hist(df[df['track_genre'] == 'pop']['tempo'], label="Pop")
plt.hist(df[df['track_genre'] == 'rock']['tempo'], label="Rock")
plt.legend()
plt.title('Distribution of Tempo for Jazz, Pop, and Rock')
plt.xlabel('Tempo')
plt.ylabel('Number of occurrences')

F_onewayResult(statistic=5.031362739035586, pvalue=0.006659749277153722)

Text(0, 0.5, 'Number of occurrences')


from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


df_pt = df[['track_genre', 'popularity', 'tempo']]
df_filtered = df_pt[df_pt['track_genre'] == 'k-pop']
df_pt_filtered = df_filtered[['popularity', 'tempo']]
# df_encoded = pd.get_dummies(df_p, columns=['track_genre'])
# df_encoded

X = df_pt_filtered.drop('popularity', axis=1)
y = df_pt_filtered['popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

plt.scatter(X_train, y_train)
plt.title('Effect of Tempo on Popularity of K-pop')
plt.xlabel('tempo')
plt.ylabel('popularity')
plt.tight_layout()
plt.show()

y_train_pred = LinearRegression().fit(X_train, y_train).predict(X_train)
y_test_pred = LinearRegression().fit(X_test, y_test).predict(X_test)

plt.scatter(X_train, y_train)
plt.plot(X_train, y_train_pred, color='green', label='Linear Regression')
plt.title('Effect of Tempo on Popularity of K-pop With Linear Regression')
plt.xlabel('tempo')
plt.ylabel('popularity')
plt.tight_layout()
plt.show()

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(LinearRegression(), X_train_scaled, y_train, cv=skf)

print(f"Cross-validation accuracy (mean): {cv_scores.mean()}")
print(f"Cross-validation accuracy (std): {cv_scores.std()}\n")

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"MSE Train: {mse_train}")
print(f"MSE Test: {mse_test}")
print(f"R2 Train: {r2_train}")
print(f"R2 Test: {r2_test}")


# using a linear regression, we can see that there is not much of a correlation
# between popularity and tempo, as shown with k-pop as an example

Cross-validation accuracy (mean): -0.0011768987242749551
Cross-validation accuracy (std): 0.003464192059949629

MSE Train: 133.5647178732994
MSE Test: 192.24413045740053
R2 Train: 0.0014804277957974898
R2 Test: 0.008167105607016056

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(


df_pd = df[['track_genre', 'popularity', 'danceability']]
df_filtered = df_pd[df_pd['track_genre'] == 'k-pop']
df_pd_filtered = df_filtered[['popularity', 'danceability']]

X = df_pt_filtered.drop('popularity', axis=1)
y = df_pt_filtered['popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

plt.scatter(X_train, y_train)
plt.title('Effect of Danceability on Popularity for K-pop')
plt.xlabel('danceability')
plt.ylabel('popularity')
plt.tight_layout()
plt.show()

y_train_pred = LinearRegression().fit(X_train, y_train).predict(X_train)
y_test_pred = LinearRegression().fit(X_test, y_test).predict(X_test)

plt.scatter(X_train, y_train)
plt.plot(X_train, y_train_pred, color='green', label='Linear Regression')
plt.title('Effect of Danceability on Popularity for K-pop With Linear Regression')
plt.xlabel('danceability')
plt.ylabel('popularity')
plt.tight_layout()
plt.show()

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(LinearRegression(), X_train_scaled, y_train, cv=skf)

print(f"Cross-validation accuracy (mean): {cv_scores.mean()}")
print(f"Cross-validation accuracy (std): {cv_scores.std()}\n")

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"MSE Train: {mse_train}")
print(f"MSE Test: {mse_test}")
print(f"R2 Train: {r2_train}")
print(f"R2 Test: {r2_test}")

Cross-validation accuracy (mean): -0.0011768987242749551
Cross-validation accuracy (std): 0.003464192059949629

MSE Train: 133.5647178732994
MSE Test: 192.24413045740053
R2 Train: 0.0014804277957974898
R2 Test: 0.008167105607016056

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(


df_pd = df[['popularity', 'danceability']]
df_pt = df[['popularity', 'tempo']]

X1 = df_pd.drop('popularity', axis=1)
y1 = df_pd['popularity']

X2 = df_pt.drop('popularity', axis=1)
y2 = df_pt['popularity']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

plt.scatter(X1_train, y1_train)
plt.title('Effect of Danceability on Popularity with all genres')
plt.xlabel('danceability')
plt.ylabel('popularity')
plt.tight_layout()
plt.show()

plt.scatter(X2_train, y2_train)
plt.title('Effect of Tempo on Popularity with all genres')
plt.xlabel('tempo')
plt.ylabel('popularity')
plt.tight_layout()
plt.show()

	track_id	artists	album_name	track_name	popularity	duration_ms	explicit	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo	time_signature	track_genre
0	5SuOikwiRyPMVoIQDJUgSV	Gen Hoshino	Comedy	Comedy	73	230666	False	0.676	0.4610	1	-6.746	0	0.1430	0.0322	0.000001	0.3580	0.7150	87.917	4	acoustic
1	4qPNDBW1i3p13qLCt0Ki3A	Ben Woodward	Ghost (Acoustic)	Ghost - Acoustic	55	149610	False	0.420	0.1660	1	-17.235	1	0.0763	0.9240	0.000006	0.1010	0.2670	77.489	4	acoustic
2	1iJBSr7s7jYXzM8EGcbK5b	Ingrid Michaelson;ZAYN	To Begin Again	To Begin Again	57	210826	False	0.438	0.3590	0	-9.734	1	0.0557	0.2100	0.000000	0.1170	0.1200	76.332	4	acoustic
3	6lfxq3CG4xtTiEg7opyCyx	Kina Grannis	Crazy Rich Asians (Original Motion Picture Sou...	Can't Help Falling In Love	71	201933	False	0.266	0.0596	0	-18.515	1	0.0363	0.9050	0.000071	0.1320	0.1430	181.740	3	acoustic
4	5vjLSffimiIP26QG5WcN2K	Chord Overstreet	Hold On	Hold On	82	198853	False	0.618	0.4430	2	-9.681	1	0.0526	0.4690	0.000000	0.0829	0.1670	119.949	4	acoustic
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
113995	2C3TZjDRiAzdyViavDJ217	Rainy Lullaby	#mindfulness - Soft Rain for Mindful Meditatio...	Sleep My Little Boy	21	384999	False	0.172	0.2350	5	-16.393	1	0.0422	0.6400	0.928000	0.0863	0.0339	125.995	5	world-music
113996	1hIz5L4IB9hN3WRYPOCGPw	Rainy Lullaby	#mindfulness - Soft Rain for Mindful Meditatio...	Water Into Light	22	385000	False	0.174	0.1170	0	-18.318	0	0.0401	0.9940	0.976000	0.1050	0.0350	85.239	4	world-music
113997	6x8ZfSoqDjuNa5SVP5QjvX	Cesária Evora	Best Of	Miss Perfumado	22	271466	False	0.629	0.3290	0	-10.895	0	0.0420	0.8670	0.000000	0.0839	0.7430	132.378	4	world-music
113998	2e6sXL2bYv4bSz6VTdnfLs	Michael W. Smith	Change Your World	Friends	41	283893	False	0.587	0.5060	7	-10.889	1	0.0297	0.3810	0.000000	0.2700	0.4130	135.960	4	world-music
113999	2hETkH7cOfqmz3LqZDHZf5	Cesária Evora	Miss Perfumado	Barbincor	22	241826	False	0.526	0.4870	1	-10.204	0	0.0725	0.6810	0.000000	0.0893	0.7080	79.198	4	world-music

	popularity
track_genre
k-pop	59.358779
pop-film	59.096933
metal	56.422414
chill	53.738683
latino	51.788945
sad	51.109929
grunge	50.587007
indian	49.765348
anime	48.776884
emo	48.500000
reggaeton	48.270270
sertanejo	47.860775
piano	46.608312
progressive-house	46.537748
hard-rock	45.744711
pagode	45.585799
deep-house	45.573045
mandopop	45.071019
british	44.768889
metalcore	44.708914
brazil	44.645678
electronic	44.234940
ambient	44.208208
singer-songwriter	43.592030
acoustic	42.483000
hip-hop	42.429929
pop	41.944712
punk	41.884956
forro	41.831663
world-music	41.536295

	popularity
track_genre
idm	15.522222
kids	14.770791
grindcore	14.521827
classical	13.362168
chicago-house	12.333667
detroit-techno	11.130753
latin	9.855072
jazz	9.790076
romance	3.549779
iranian	2.224696

Popularity of Music Explained

Spring 2024 Data Science Project By: Areeb Malik, Arjun Sharma, Katya Kiryutin

1: Introduction

2: Managing our Data

2.1: Cleaning our Data

3: Exploratory analysis: Finding popular and least popular Genres

3.1: Testing our Data

4. Primary analysis and visualization

5. Insights and Conclusions