Anda di halaman 1dari 6

8/12/2019 MovieLens_DataAnalysis

In [1]: from timeit import default_timer


start = default_timer()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 1/6
8/12/2019 MovieLens_DataAnalysis

In [2]: st = default_timer()

# First-time data load: read the MovieLens movies and ratings CSV files into DataFrames.
# NOTE(review): both read_csv calls below are cut off at the right margin of this printout,
# so the end of each path and any further arguments are not visible here.
# NOTE(review): the paths are absolute and machine-specific; presumably a configurable data
# directory would make the notebook portable -- confirm against the real notebook.


movies = pd.read_csv('C:/Users/a.venkitachalam/Downloads/ml-20m/ml-20m/movies.csv
ratings = pd.read_csv('C:/Users/a.venkitachalam/Downloads/ml-20m/ml-20m/ratings.c

# Order both frames by movieId and rebuild a contiguous index afterwards.
# (Nothing is written to feather format by the code shown here.)


movies.sort_values(by='movieId', inplace=True)
movies.reset_index(inplace=True, drop=True)
ratings.sort_values(by='movieId', inplace=True)
ratings.reset_index(inplace=True, drop=True)

# Show the column dtypes of the ratings frame as loaded (timestamp is still an integer here).
print(ratings.dtypes)

# Split the release year out of the title into its own 'year' column.
# The year is taken from the first "(YYYY)" group found in the title; titles without one get NaN.
# The pattern is a raw string so that "\(" and "\d" reach the regex engine unchanged instead of
# being parsed by Python as (invalid) string escape sequences.
movies['year'] = movies.title.str.extract(r"\((\d{4})\)", expand=False)
movies['year'] = pd.to_datetime(movies['year'], format='%Y')
movies['year'] = movies['year'].dt.year  # missing years make the resulting dtype float, not int
# Drop the last 7 characters of every title, i.e. the trailing " (YYYY)" suffix.
movies['title'] = movies.title.str[:-7]

# Categorize movie genres: build the list of distinct genre names, then replace the
# '|'-separated 'genres' string with one boolean column per genre.
# NOTE(review): the next statement is cut off at the right margin of this printout (it ends in
# ".uniqu"); presumably it finishes with .unique() -- confirm against the real notebook.
genres_unique = pd.DataFrame(movies.genres.str.split('|').tolist()).stack().uniqu
genres_unique = pd.DataFrame(genres_unique, columns=['genre']) # One-column DataFrame of genre names
# str.get_dummies() splits on '|' by default and yields one indicator column per genre;
# the indicators are cast to bool and joined onto the movies frame.
movies = movies.join(movies.genres.str.get_dummies().astype(bool))
# The original string column is dropped once the indicator columns exist.
movies.drop('genres', inplace=True, axis=1)

# Convert the rating timestamp from Unix epoch seconds to the calendar year of the rating.
# unit='s' is required: for integer input pd.to_datetime assumes nanoseconds by default,
# which would place every rating in 1970.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.year

# Report, for each frame, the largest number of missing values found in any single column.
print ("Number of movies Null values: ", movies.isna().sum().max())
print ("Number of ratings Null values: ", ratings.isna().sum().max())

# Movies: discard incomplete rows, then restore movieId order and a contiguous index.
movies.dropna(inplace=True)
movies.sort_values(by='movieId', inplace=True)
movies.reset_index(drop=True, inplace=True)

# Ratings: same clean-up as for movies.
ratings.dropna(inplace=True)
ratings.sort_values(by='movieId', inplace=True)
ratings.reset_index(drop=True, inplace=True)

# Time spent in this cell, measured from the timer started at the top of the cell.
runtime = default_timer() - st
print ("Elapsed time(sec): ", round(runtime, 2))

userId int64
movieId int64
rating float64
timestamp int64
dtype: object
Number of movies Null values: 17
Number of ratings Null values: 0
Elapsed time(sec): 39.85

localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 2/6
8/12/2019 MovieLens_DataAnalysis

In [3]: st = default_timer()

# Let's work with a temp smaller slice 'dftmp' of the original dataframe to reduc
dftmp = movies[['movieId', 'year']].groupby('year')

fig, ax1 = plt.subplots(figsize=(10,5))


ax1.plot(dftmp.year.first(), dftmp.movieId.nunique(), "g-o")
ax1.grid(None)
ax1.set_ylim(0,)

dftmp = ratings[['rating', 'timestamp']].groupby('timestamp')


ax2 = ax1.twinx()
ax2.plot(dftmp.timestamp.first(), dftmp.rating.count(), "r-o")
ax2.grid(None)
ax2.set_ylim(0,)

ax1.set_xlabel('Year')
ax1.set_ylabel('Number of movies released'); ax2.set_ylabel('Number of ratings')
plt.title('Movies per year')
plt.show()

# Housekeeping
%reset_selective -f (^dftmp$|^ax1$|^ax2$)

runtime = default_timer() - st
print ("Elapsed time(sec): ", round(runtime,2))

Elapsed time(sec): 1.85

In [ ]:

In [ ]:

localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 3/6
8/12/2019 MovieLens_DataAnalysis

In [4]: st = default_timer()

plt.figure(figsize=(10,5))
dftmp = movies[['movieId', 'year']].groupby('year')
df = pd.DataFrame({'All_movies' : dftmp.movieId.nunique().cumsum()})
# Plot histogram for each individual genre
for genre in genres_unique.genre:
dftmp = movies[movies[genre]][['movieId', 'year']].groupby('year')
df[genre]=dftmp.movieId.nunique().cumsum()
df.fillna(method='ffill', inplace=True)
df.loc[:,df.columns!='All_movies'].plot.area(stacked=True, figsize=(10,5))
# Plot histogram for all movies
plt.plot(df['All_movies'], marker='o', markerfacecolor='black')
plt.xlabel('Year')
plt.ylabel('Cumulative number of movies-genre')
plt.title('Total movies-genre') # Many movies have multiple genres, so counthere
plt.legend(loc=(1.05,0), ncol=2)
plt.show()
# Plot simple scatter of the number of movies tagged with each genre
plt.figure(figsize=(15,5))
barlist = df.iloc[-1].plot.bar()
barlist.patches[0].set_color('b') # Color 'All_movies' differently, as it's not a
plt.xticks(rotation='vertical')
plt.title('Movies per genre tag')
plt.xlabel('Genre')
plt.ylabel('Number of movies tagged')
plt.show()

# Housekeeping
%reset_selective -f (^barlist$|^dftmp$|^genre$)

runtime = default_timer() - st
print ("Elapsed time(sec): ", round(runtime,2))

<Figure size 720x360 with 0 Axes>

localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 4/6
8/12/2019 MovieLens_DataAnalysis

Elapsed time(sec): 2.09

In [ ]:

localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 5/6
8/12/2019 MovieLens_DataAnalysis

In [5]: st = default_timer()

# Mean rating of each movie over all of its ratings (one row per movieId).
dftmp = ratings[['movieId','rating']].groupby('movieId').mean()

# Initialize an empty list to collect basic stats per genre: (genre, mean, std) tuples.


rating_stats = []
# Plot a general histogram of the per-movie mean ratings across all genres.
# NOTE(review): the hist(...) call below is cut off at the right margin of this printout.
# NOTE(review): 'normed' is deprecated in Matplotlib in favor of 'density' (see the warning
# this cell emits).
dftmp.hist(bins=25, grid=False, edgecolor='b', normed=True, label ='All genres',
# Plot the distribution per genre as kde lines (for better visibility than overlaid histograms).
# NOTE(review): indentation was lost in this printout; presumably the statements from the
# movies[...] filter down to rating_stats.append(...) form the loop body -- confirm.
for genre in genres_unique.genre:
# Movies flagged with this genre.
dftmp = movies[movies[genre]==True]
# Ratings restricted to those movies.
# NOTE(review): this statement is cut off at the right margin of this printout.
dftmp = ratings[ratings.set_index('movieId').index.isin(dftmp.set_index('movi
# Mean rating per movie within the genre.
dftmp = dftmp[['movieId','rating']].groupby('movieId').mean()
dftmp.rating.plot(grid=False, alpha=0.6, kind='kde', label=genre)
# Mean and standard deviation of the per-movie mean ratings for this genre.
avg = dftmp.rating.mean()
std = dftmp.rating.std()
rating_stats.append((genre, avg, std))
# Legend placed outside the plot area, to the right, in two columns.
plt.legend(loc=(1.05,0), ncol=2)
plt.xlim(0,5)
plt.xlabel('Movie rating')
plt.title('Movie rating histograms')
plt.show()

# Housekeeping: remove this cell's temporaries from the interactive namespace
# (rating_stats is kept).
%reset_selective -f (^avg$|^dftmp$|^genre$|^std$)

# Time spent in this cell, measured from the timer started at the top of the cell.
runtime = default_timer() - st
print ("Elapsed time(sec): ", round(runtime,2))

C:\Users\a.venkitachalam\AppData\Local\Continuum\anaconda3\lib\site-packages
\matplotlib\axes\_axes.py:6521: MatplotlibDeprecationWarning:
The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.
1. Use 'density' instead.
alternative="'density'", removal="3.1")

localhost:8888/notebooks/Downloads/MovieLens_DataAnalysis.ipynb 6/6

Anda mungkin juga menyukai