import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 1/6
8/12/2019 MovieLens_DataAnalysis
In [2]: st = default_timer()
# Show the column types of the ratings dataframe.
print(ratings.dtypes)

# Split title and release year into separate columns of the movies dataframe.
# The pattern is a raw string so that \( and \d reach the regex engine instead of
# being treated as (invalid) string escapes.
movies['year'] = movies.title.str.extract(r"\((\d{4})\)", expand=False)
# Titles without a "(YYYY)" group yield NaN here, so the resulting year column
# is float rather than int.
movies.year = pd.to_datetime(movies.year, format='%Y').dt.year
# Strip only a trailing "(YYYY)" suffix. A fixed [:-7] slice would also cut the
# last seven characters off titles that carry no year at all.
movies.title = movies.title.str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)
# Categorize movie genres: replace the '|'-separated genres string with one boolean
# column per genre. (The original note mentions working later with 20MM+ rows of
# strings; the rest of that remark is cut off in this copy.)
# Distinct genre names found across all movies.
genres_unique = pd.DataFrame(movies.genres.str.split('|').tolist()).stack().unique()
genres_unique = pd.DataFrame(genres_unique, columns=['genre'])  # Format into DataFrame
# One boolean indicator column per genre, joined onto movies by index; the separator
# is spelled out to match the split above.
movies = movies.join(movies.genres.str.get_dummies(sep='|').astype(bool))
# The raw genres string column is no longer needed.
movies.drop('genres', inplace=True, axis=1)

# Report how long this cell took; st is the timer value taken at the top of the cell.
runtime = default_timer() - st
print ("Elapsed time(sec): ", round(runtime,2))
userId int64
movieId int64
rating float64
timestamp int64
dtype: object
Number of movies Null values: 17
Number of ratings Null values: 0
Elapsed time(sec): 39.85
localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 2/6
8/12/2019 MovieLens_DataAnalysis
In [3]: st = default_timer()
# Let's work with a temp smaller slice 'dftmp' of the original dataframe to reduc
dftmp = movies[['movieId', 'year']].groupby('year')
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of movies released'); ax2.set_ylabel('Number of ratings')
plt.title('Movies per year')
plt.show()
# Housekeeping
%reset_selective -f (^dftmp$|^ax1$|^ax2$)
runtime = default_timer() - st
print ("Elapsed time(sec): ", round(runtime,2))
In [ ]:
In [ ]:
localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 3/6
8/12/2019 MovieLens_DataAnalysis
In [4]: st = default_timer()
plt.figure(figsize=(10,5))
dftmp = movies[['movieId', 'year']].groupby('year')
df = pd.DataFrame({'All_movies' : dftmp.movieId.nunique().cumsum()})
# Plot histogram for each individual genre
for genre in genres_unique.genre:
dftmp = movies[movies[genre]][['movieId', 'year']].groupby('year')
df[genre]=dftmp.movieId.nunique().cumsum()
df.fillna(method='ffill', inplace=True)
df.loc[:,df.columns!='All_movies'].plot.area(stacked=True, figsize=(10,5))
# Plot histogram for all movies
plt.plot(df['All_movies'], marker='o', markerfacecolor='black')
plt.xlabel('Year')
plt.ylabel('Cumulative number of movies-genre')
plt.title('Total movies-genre') # Many movies have multiple genres, so counthere
plt.legend(loc=(1.05,0), ncol=2)
plt.show()
# Plot simple scatter of the number of movies tagged with each genre
plt.figure(figsize=(15,5))
barlist = df.iloc[-1].plot.bar()
barlist.patches[0].set_color('b') # Color 'All_movies' differently, as it's not a
plt.xticks(rotation='vertical')
plt.title('Movies per genre tag')
plt.xlabel('Genre')
plt.ylabel('Number of movies tagged')
plt.show()
# Housekeeping
%reset_selective -f (^barlist$|^dftmp$|^genre$)
runtime = default_timer() - st
print ("Elapsed time(sec): ", round(runtime,2))
localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 4/6
8/12/2019 MovieLens_DataAnalysis
In [ ]:
localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 5/6
8/12/2019 MovieLens_DataAnalysis
In [5]: st = default_timer()
dftmp = ratings[['movieId','rating']].groupby('movieId').mean()
# Housekeeping
%reset_selective -f (^avg$|^dftmp$|^genre$|^std$)
runtime = default_timer() - st
print ("Elapsed time(sec): ", round(runtime,2))
C:\Users\a.venkitachalam\AppData\Local\Continuum\anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6521: MatplotlibDeprecationWarning:
The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
alternative="'density'", removal="3.1")
localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 6/6