Hands-On Project: Movie Recommendation System
Difficulty Level: Intermediate | Duration: 3-5 hours | Topics: Collaborative Filtering, Machine Learning, Data Mining
Related Learning Paths
Learning Objectives
- Understand collaborative filtering algorithms
- Implement user-based and item-based recommendations
- Build a matrix factorization model
- Evaluate recommendation quality
- Handle sparse data in recommendation systems
Prerequisites
- Python 3.8+
- Pandas, NumPy, Scikit-learn
- Understanding of matrix operations
- Basic knowledge of algorithms
Step 1: Setup & Data Loading
1.1 Install Libraries
pip install pandas numpy scikit-learn matplotlib seaborn scipy
1.2 Load Dataset
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
# Load the MovieLens "latest-small" dataset.
# The dataset is distributed as a zip archive holding several CSV files, so
# it cannot be read directly from a URL with pd.read_csv. Download and extract
# https://files.grouplens.org/datasets/movielens/ml-latest-small.zip first,
# then read the extracted files from the working directory.
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
print(f"Ratings shape: {ratings.shape}")
print(f"Movies shape: {movies.shape}")
print(f"\nFirst few ratings:\n{ratings.head()}")
print(f"\nFirst few movies:\n{movies.head()}")
Step 2: Data Preprocessing
2.1 Explore Data
# Summary statistics of the rating values
print("Rating Statistics:")
print(ratings['rating'].describe())
print(f"\nUnique users: {ratings['userId'].nunique()}")
print(f"Unique movies: {ratings['movieId'].nunique()}")
# Sparsity = share of (user, movie) pairs that have no rating row
print(f"Sparsity: {1 - (len(ratings) / (ratings['userId'].nunique() * ratings['movieId'].nunique())):.2%}")
# Per-movie mean rating and rating count, most frequently rated first
popular_movies = ratings.groupby('movieId')['rating'].agg(['mean', 'count']).sort_values('count', ascending=False)
print("\nTop 10 rated movies:")
print(popular_movies.head(10))
2.2 Create User-Item Matrix
# Pivot the ratings into a user x movie matrix; pairs without a rating become 0
user_item_matrix = pd.pivot_table(
    ratings,
    values='rating',
    index='userId',
    columns='movieId',
    fill_value=0,
)
print(f"User-Item Matrix shape: {user_item_matrix.shape}")
# Share of cells equal to 0 (i.e. unrated pairs)
print(f"Matrix sparsity: {user_item_matrix.eq(0).sum().sum() / user_item_matrix.size:.2%}")
Step 3: Implement Collaborative Filtering
3.1 User-Based Collaborative Filtering
class UserBasedCF:
    """User-based collaborative filtering over a user-item rating matrix.

    Args:
        user_item_matrix: DataFrame indexed by user id with one column per
            item; entries greater than 0 are treated as ratings, everything
            else as "not rated".
    """

    def __init__(self, user_item_matrix):
        self.user_item_matrix = user_item_matrix
        # Pairwise cosine similarity between the users' rating rows
        self.user_similarity = pd.DataFrame(
            cosine_similarity(user_item_matrix),
            index=user_item_matrix.index,
            columns=user_item_matrix.index,
        )

    def recommend(self, user_id, n_recommendations=5, n_similar_users=10):
        """Score unrated items by a similarity-weighted average of neighbours.

        Unrated cells of a neighbour (stored as 0) take part in the average
        as zeros, so scores are pulled towards 0 for rarely rated items.

        Args:
            user_id: Label of the user in the matrix index.
            n_recommendations: Maximum number of items to return.
            n_similar_users: Number of most similar users to average over.

        Returns:
            Series of scores indexed by item id, highest first. Empty when
            the neighbours carry no similarity mass at all.

        Raises:
            KeyError: If ``user_id`` is not present in the matrix.
        """
        # Drop the user explicitly rather than assuming it sorts first:
        # ties or floating-point error can put another user ahead of it.
        neighbours = (
            self.user_similarity[user_id]
            .drop(user_id)
            .sort_values(ascending=False)
            .head(n_similar_users)
        )
        total_similarity = neighbours.sum()
        if not total_similarity > 0:
            # Nothing to weight by; avoid a 0/0 division producing NaN scores.
            return pd.Series(dtype=float, index=self.user_item_matrix.columns[:0])

        neighbour_ratings = self.user_item_matrix.loc[neighbours.index]
        weighted_ratings = neighbour_ratings.T.dot(neighbours.to_numpy()) / total_similarity

        # Keep only the items the user has not rated yet
        already_rated = self.user_item_matrix.loc[user_id] > 0
        candidates = weighted_ratings[~already_rated].sort_values(ascending=False)
        return candidates.head(n_recommendations)
# Example usage: fit the user-based model on the full matrix and query user 1
user_cf = UserBasedCF(user_item_matrix=user_item_matrix)
recommendations = user_cf.recommend(1, n_recommendations=5)
print("Recommended movies for user 1:", recommendations, sep="\n")
3.2 Item-Based Collaborative Filtering
class ItemBasedCF:
    """Item-based collaborative filtering over a user-item rating matrix.

    Args:
        user_item_matrix: DataFrame indexed by user id with one column per
            item; entries greater than 0 are treated as ratings.
    """

    def __init__(self, user_item_matrix):
        self.user_item_matrix = user_item_matrix
        # Pairwise cosine similarity between the items' rating columns
        self.item_similarity = pd.DataFrame(
            cosine_similarity(user_item_matrix.T),
            index=user_item_matrix.columns,
            columns=user_item_matrix.columns,
        )

    def recommend(self, user_id, n_recommendations=5):
        """Score unrated items by their similarity to the user's rated items.

        Each candidate's score is the sum, over the items the user rated, of
        (similarity to that item) x (the user's rating of it). Scores are
        unnormalised sums, not values on the rating scale.

        Args:
            user_id: Label of the user in the matrix index.
            n_recommendations: Maximum number of items to return.

        Returns:
            Float Series of scores indexed by item id, highest first.

        Raises:
            KeyError: If ``user_id`` is not present in the matrix.
        """
        user_ratings = self.user_item_matrix.loc[user_id]
        rated_mask = user_ratings > 0
        rated_items = user_ratings[rated_mask]

        if rated_items.empty:
            # No ratings to propagate: every item scores 0.
            scores = pd.Series(0.0, index=self.user_item_matrix.columns)
        else:
            # One matrix-vector product replaces the per-item accumulation loop.
            scores = self.item_similarity[rated_items.index].dot(rated_items)

        # Use the same "rated" definition for filtering as for scoring
        return scores[~rated_mask].sort_values(ascending=False).head(n_recommendations)
# Example usage: fit the item-based model on the full matrix and query user 1
item_cf = ItemBasedCF(user_item_matrix=user_item_matrix)
recommendations = item_cf.recommend(1, n_recommendations=5)
print("Item-based recommendations for user 1:", recommendations, sep="\n")
Step 4: Matrix Factorization
4.1 Implement Singular Value Decomposition (SVD)
class SVDRecommender:
    """Recommender based on a truncated SVD of the user-item matrix.

    Args:
        user_item_matrix: DataFrame indexed by user id with one column per
            item; entries greater than 0 are treated as ratings.
        n_factors: Requested number of latent factors. It is capped at the
            smaller matrix dimension so that small matrices do not make
            TruncatedSVD reject the component count.
    """

    def __init__(self, user_item_matrix, n_factors=50):
        self.user_item_matrix = user_item_matrix
        # Effective factor count actually passed to the decomposition
        self.n_factors = min(n_factors, *user_item_matrix.shape)

        # Factorise: rows of user_factors / item_factors are latent vectors
        self.svd = TruncatedSVD(n_components=self.n_factors, random_state=42)
        self.user_factors = self.svd.fit_transform(user_item_matrix)
        self.item_factors = self.svd.components_.T

        # Low-rank reconstruction of the full matrix serves as the predictions
        self.predicted_ratings = np.dot(self.user_factors, self.item_factors.T)
        self.predicted_matrix = pd.DataFrame(
            self.predicted_ratings,
            index=user_item_matrix.index,
            columns=user_item_matrix.columns,
        )

    def recommend(self, user_id, n_recommendations=5):
        """Return the unrated items with the highest reconstructed score.

        Args:
            user_id: Label of the user in the matrix index.
            n_recommendations: Maximum number of items to return.

        Returns:
            Series of reconstructed scores indexed by item id, highest first.

        Raises:
            KeyError: If ``user_id`` is not present in the matrix.
        """
        user_predictions = self.predicted_matrix.loc[user_id]
        already_rated = self.user_item_matrix.loc[user_id] > 0
        candidates = user_predictions[~already_rated].sort_values(ascending=False)
        return candidates.head(n_recommendations)
# Example usage: factorise the full matrix with 50 factors and query user 1
svd_recommender = SVDRecommender(user_item_matrix=user_item_matrix, n_factors=50)
recommendations = svd_recommender.recommend(1, n_recommendations=5)
print("SVD-based recommendations for user 1:", recommendations, sep="\n")
Step 5: Evaluate Recommendations
5.1 Train-Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rating rows for testing (fixed seed for repeatability)
train_ratings, test_ratings = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42,
)
# Pivot each split into its own user x movie matrix, filling gaps with 0
train_matrix, test_matrix = (
    pd.pivot_table(
        split,
        values='rating',
        index='userId',
        columns='movieId',
        fill_value=0,
    )
    for split in (train_ratings, test_ratings)
)
print(f"Train set size: {len(train_ratings)}")
print(f"Test set size: {len(test_ratings)}")
5.2 Evaluation Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
def evaluate_recommender(recommender, test_ratings, top_n=5):
    """Evaluate a recommender against held-out ratings.

    For every user in ``test_ratings`` the top ``top_n`` recommendations are
    requested; a held-out rating counts as a hit when its movie is among
    them. MAE and RMSE are computed over the hits only, comparing the
    held-out rating with the score the recommender returned for that movie.

    Args:
        recommender: Object exposing ``recommend(user_id, n_recommendations=...)``
            that returns a Series of scores indexed by movie id.
        test_ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns.
        top_n: Number of recommendations requested per user.

    Returns:
        Dict with keys ``'MAE'``, ``'RMSE'`` and ``'Hit Rate'``; each is 0
        when there is nothing to average over.
    """
    errors = []
    hits = 0
    total = 0

    # One pass per user instead of re-filtering the whole frame for each user
    for user_id, user_items in test_ratings.groupby('userId', sort=False):
        total += len(user_items)
        try:
            recommendations = recommender.recommend(user_id, n_recommendations=top_n)
        except KeyError:
            # The user is unknown to the model (e.g. every one of their
            # ratings landed in the test split); their items stay misses.
            continue

        hit_rows = user_items[user_items['movieId'].isin(recommendations.index)]
        if hit_rows.empty:
            continue
        hits += len(hit_rows)
        predicted = recommendations.reindex(hit_rows['movieId']).to_numpy()
        errors.extend(np.abs(hit_rows['rating'].to_numpy() - predicted))

    mae = np.mean(errors) if errors else 0
    rmse = np.sqrt(np.mean(np.square(errors))) if errors else 0
    hit_rate = hits / total if total > 0 else 0
    return {
        'MAE': mae,
        'RMSE': rmse,
        'Hit Rate': hit_rate,
    }
# Fit the SVD model on the training split only, then score it on the held-out rows
svd_recommender = SVDRecommender(user_item_matrix=train_matrix, n_factors=50)
metrics = evaluate_recommender(recommender=svd_recommender, test_ratings=test_ratings)
print("SVD Recommender Metrics:")
# One line per metric, four decimals
for metric, value in metrics.items():
    print(f" {metric}: {value:.4f}")
Step 6: Get Movie Names in Recommendations
6.1 Map IDs to Titles
def get_recommendations_with_titles(recommender, user_id, n_recommendations=5, movies_df=None):
    """Return a user's recommendations together with the movie titles.

    Args:
        recommender: Object exposing ``recommend(user_id, n_recommendations)``
            that returns a Series of scores indexed by movie id.
        user_id: User to recommend for.
        n_recommendations: Maximum number of rows in the result.
        movies_df: DataFrame with ``movieId`` and ``title`` columns used for
            the lookup. Defaults to the module-level ``movies`` frame.

    Returns:
        DataFrame with columns ``movieId``, ``title`` and
        ``predicted_rating``, in recommendation order. A movie id missing
        from the catalog gets a missing (NaN) title instead of raising.
    """
    if movies_df is None:
        movies_df = movies

    recommendations = recommender.recommend(user_id, n_recommendations)

    # Build the id -> title lookup once instead of scanning the frame per
    # item; on duplicate ids the first title wins, as with the `.values[0]` scan.
    titles = (
        movies_df.drop_duplicates('movieId')
        .set_index('movieId')['title']
        .reindex(recommendations.index)
    )
    return pd.DataFrame({
        'movieId': recommendations.index.to_numpy(),
        'title': titles.to_numpy(),
        'predicted_rating': recommendations.to_numpy(),
    })
# Example usage: show titled top-5 picks from the SVD model for user 1
print("Recommendations for User 1:")
print(get_recommendations_with_titles(svd_recommender, 1, n_recommendations=5))
Step 7: Hybrid Recommendation System
7.1 Combine Multiple Methods
class HybridRecommender:
    """Weighted blend of the user-based, item-based and SVD recommenders.

    Args:
        user_item_matrix: DataFrame indexed by user id with one column per
            item, passed unchanged to each underlying model.
    """

    def __init__(self, user_item_matrix):
        self.user_based = UserBasedCF(user_item_matrix)
        self.item_based = ItemBasedCF(user_item_matrix)
        self.svd_based = SVDRecommender(user_item_matrix)

    @staticmethod
    def _scale_to_unit(scores):
        """Rescale scores so the largest magnitude is 1 (all-zero stays as is)."""
        scores = scores.astype(float).fillna(0.0)
        peak = scores.abs().max() if len(scores) else 0.0
        return scores / peak if peak > 0 else scores

    def recommend(self, user_id, n_recommendations=5, weights=None):
        """Blend the three models' candidate lists into one ranking.

        Each model contributes its top ``3 * n_recommendations`` candidates.
        The models score on different scales (averages, similarity sums,
        reconstructed values), so each list is first rescaled to a maximum
        magnitude of 1; otherwise the model with the largest raw numbers
        would dominate regardless of the weights. An item a model did not
        return contributes 0 for that model.

        Args:
            user_id: User to recommend for.
            n_recommendations: Maximum number of items to return.
            weights: Optional dict with keys ``'user'``, ``'item'`` and
                ``'svd'``; a missing key counts as weight 0. Defaults to
                0.3 / 0.3 / 0.4.

        Returns:
            Series of blended scores indexed by item id, highest first.
        """
        if weights is None:
            weights = {'user': 0.3, 'item': 0.3, 'svd': 0.4}

        n_candidates = n_recommendations * 3
        candidates = {
            'user': self.user_based.recommend(user_id, n_candidates),
            'item': self.item_based.recommend(user_id, n_candidates),
            'svd': self.svd_based.recommend(user_id, n_candidates),
        }

        # Align the three lists on the union of their items; gaps score 0
        scaled = pd.DataFrame(
            {name: self._scale_to_unit(recs) for name, recs in candidates.items()}
        ).fillna(0.0)

        hybrid_scores = pd.Series(0.0, index=scaled.index)
        for name in candidates:
            hybrid_scores = hybrid_scores + scaled[name] * weights.get(name, 0.0)

        # A stable sort keeps tie order reproducible between runs
        ranked = hybrid_scores.sort_values(ascending=False, kind='mergesort')
        return ranked.head(n_recommendations)
# Example usage: build the hybrid on the training split and query user 1
hybrid = HybridRecommender(user_item_matrix=train_matrix)
recommendations = hybrid.recommend(1, n_recommendations=5)
print("Hybrid recommendations:", recommendations, sep="\n")