|
| 1 | +# Pyspark ALS Model Applications |
| 2 | +Notes here are from a course on [DataCamp](https://campus.datacamp.com/courses/recommendation-engines-in-pyspark). |
| 3 | + |
| 4 | +## Recommendation System with PySpark |
| 5 | + |
| 6 | +### Toy 1 |
| 7 | +Suppose we have a PySpark DataFrame called `ratings`: |
| 8 | +```bash |
| 9 | +In [1]: ratings.show(5) |
| 10 | ++------+-------+------+ |
| 11 | +|userId|movieId|rating| |
| 12 | ++------+-------+------+ |
| 13 | +| 2| 3| 3.0| |
| 14 | +| 2| 1| 4.0| |
| 15 | +| 2| 2| 4.0| |
| 16 | +| 2| 0| 3.0| |
| 17 | +| 0| 3| 4.0| |
| 18 | ++------+-------+------+ |
| 19 | +only showing top 5 rows |
| 20 | +``` |
| 21 | + |
| 22 | +```python |
| 23 | +# Split the ratings dataframe into training and test data |
| 24 | +(training_data, test_data) = ratings.randomSplit([0.8, 0.2], seed=42) |
| 25 | + |
| 26 | +# Set the ALS hyperparameters |
| 27 | +from pyspark.ml.recommendation import ALS |
| 28 | +als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", rank =10, maxIter =15, regParam =0.1, |
| 29 | + coldStartStrategy="drop", nonnegative =True, implicitPrefs = False) |
| 30 | + |
| 31 | +# Fit the model to the training_data |
| 32 | +model = als.fit(training_data) |
| 33 | + |
| 34 | +# Generate predictions on the test_data |
| 35 | +test_predictions = model.transform(test_data) |
| 36 | +test_predictions.show() |
| 37 | + |
| 38 | +# Import RegressionEvaluator |
| 39 | +from pyspark.ml.evaluation import RegressionEvaluator |
| 40 | + |
| 41 | +# Complete the evaluator code |
| 42 | +evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") |
| 43 | + |
| 44 | +# Extract the 3 parameters |
| 45 | +print(evaluator.getMetricName()) |
| 46 | +print(evaluator.getLabelCol()) |
| 47 | +print(evaluator.getPredictionCol()) |
| 48 | + |
| 49 | +# Evaluate the "test_predictions" dataframe |
| 50 | +RMSE = evaluator.evaluate(test_predictions) |
| 51 | + |
| 52 | +# Print the RMSE |
| 53 | +print (RMSE) |
| 54 | +``` |
| 55 | + |
| 56 | +### Toy 2 |
| 57 | +```python |
| 58 | +# Look at the column names |
| 59 | +print(ratings.columns) |
| 60 | + |
| 61 | +# Look at the first few rows of data |
| 62 | +print(ratings.show()) |
| 63 | +``` |
| 64 | + |
| 65 | +The output is like this: |
| 66 | +```bash |
| 67 | +['userId', 'movieId', 'rating', 'timestamp'] |
| 68 | + +------+-------+------+----------+ |
| 69 | + |userId|movieId|rating| timestamp| |
| 70 | + +------+-------+------+----------+ |
| 71 | + | 1| 31| 2.5|1260759144| |
| 72 | + | 1| 1029| 3.0|1260759179| |
| 73 | + | 1| 1061| 3.0|1260759182| |
| 74 | + | 1| 1129| 2.0|1260759185| |
| 75 | + | 1| 1172| 4.0|1260759205| |
| 76 | + | 1| 1263| 2.0|1260759151| |
| 77 | + | 1| 1287| 2.0|1260759187| |
| 78 | + | 1| 1293| 2.0|1260759148| |
| 79 | + | 1| 1339| 3.5|1260759125| |
| 80 | + | 1| 1343| 2.0|1260759131| |
| 81 | + | 1| 1371| 2.5|1260759135| |
| 82 | + | 1| 1405| 1.0|1260759203| |
| 83 | + | 1| 1953| 4.0|1260759191| |
| 84 | + | 1| 2105| 4.0|1260759139| |
| 85 | + | 1| 2150| 3.0|1260759194| |
| 86 | + | 1| 2193| 2.0|1260759198| |
| 87 | + | 1| 2294| 2.0|1260759108| |
| 88 | + | 1| 2455| 2.5|1260759113| |
| 89 | + | 1| 2968| 1.0|1260759200| |
| 90 | + | 1| 3671| 3.0|1260759117| |
| 91 | + +------+-------+------+----------+ |
| 92 | + only showing top 20 rows |
| 93 | +``` |
| 94 | + |
| 95 | +Calculate Sparsity: |
| 96 | + |
| 97 | +```python |
| 98 | +# Count the total number of ratings in the dataset |
| 99 | +numerator = ratings.select("rating").count() |
| 100 | + |
| 101 | +# Count the number of distinct userIds and distinct movieIds |
| 102 | +num_users = ratings.select("userId").distinct().count() |
| 103 | +num_movies = ratings.select("movieId").distinct().count() |
| 104 | + |
| 105 | +# Set the denominator equal to the number of users multiplied by the number of movies |
| 106 | +denominator = num_users * num_movies |
| 107 | + |
| 108 | +# Divide the numerator by the denominator |
| 109 | +sparsity = (1.0 - (numerator *1.0)/denominator)*100 |
| 110 | +print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.") |
| 111 | +``` |
| 112 | + |
| 113 | +```bash |
| 114 | +The ratings dataframe is 98.36% empty. |
| 115 | +``` |
| 116 | + |
| 117 | +Explore the dataset: |
| 118 | +```python |
| 119 | +# Import the requisite packages |
| 120 | +from pyspark.sql.functions import col |
| 121 | + |
| 122 | +# View the ratings dataset |
| 123 | +ratings.show() |
| 124 | + |
| 125 | +# Filter to show only userIds less than 100 |
| 126 | +ratings.filter(col("userId") < 100).show() |
| 127 | + |
| 128 | +# Group data by userId, count ratings |
| 129 | +ratings.groupBy("userId").count().show() |
| 130 | +``` |
| 131 | + |
| 132 | + |
0 commit comments