import numpy as np
import pandas as pd
import json
from elasticsearch import Elasticsearch
from jproperties import Properties
import matplotlib.pyplot as plt
from IPython.display import display, Image
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def create_df_from_ES_index(yelp_ES_config_path, ES_index_name):
    # Connect to Elasticsearch using the Cloud_ID and password stored in a properties file
    configs = Properties()
    with open(yelp_ES_config_path, 'rb') as config_file:
        configs.load(config_file)
    client = Elasticsearch(cloud_id=configs.get('Cloud_ID').data, basic_auth=("elastic", configs.get('Elastic_pw').data))
    # client.info()
    # Get the first batch of documents from the search
    resp = client.search(index=ES_index_name, query={"match_all": {}}, size=10000)
    hits_meta = resp['hits']['hits']
    # Get the remaining documents; this handles the case where a city/state has more than
    # 10,000 restaurants (see the scroll-based sketch after this function for an alternative)
    while resp['hits']['total']['relation'] == 'gte':
        max_ts = max([i['_source']['timestamp'] for i in resp['hits']['hits']])
        resp = client.search(index=ES_index_name, query={"range": {"timestamp": {"gte": max_ts}}}, size=10000)
        hits_meta.extend(resp['hits']['hits'])
    # Create a dataframe with the source info for every restaurant returned
    # (DataFrame.append was removed in pandas 2.0, so build the frame with json_normalize in one pass)
    df = pd.json_normalize([hit['_source'] for hit in hits_meta])
    return df.reset_index(drop=True)
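As an alternative to the timestamp-keyed pagination above, the scroll helper in the elasticsearch Python client can stream every document past the 10,000-hit search window. A minimal sketch, assuming helpers.scan is available in the installed elasticsearch-py version and reusing a client built the same way as above; the create_df_with_scan name is only for illustration:
from elasticsearch.helpers import scan

def create_df_with_scan(client, ES_index_name):
    # scan() wraps the scroll API and yields every matching hit,
    # so no manual timestamp-based pagination is needed
    hits = scan(client, index=ES_index_name, query={"query": {"match_all": {}}}, size=1000)
    # Flatten each document's _source into dataframe columns
    return pd.json_normalize([hit['_source'] for hit in hits])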
# Create dataframe after connecting to Elasticsearch given the index name
df = create_df_from_ES_index('/Users/dannie/Documents/Projects/restaurant-recommendation-system/Yelp_ES_config.properties','yelp-fusion-restaurants-sf')
def data_cleaning(df, city):
    # Prices for '$$$$' restaurants do not display properly in the dataframe => re-assign
    df.loc[df['price'] == '$$$$', 'price'] = '$$$$'
    # Re-assign restaurants with NA price to 'Unknown'
    df.loc[df['price'].isna(), 'price'] = 'Unknown'
    # Remove restaurants with empty categories
    df = df[df['categories'].apply(len) != 0]
    # Convert the list of alias/title dictionaries into a flat list of the restaurant's category titles
    df['categories'] = df['categories'].apply(lambda x: pd.DataFrame.from_dict(x)['title'].to_list())
    # Remove mismatched transactions, keeping only delivery, pickup, or restaurant reservation
    df['transactions'] = df['transactions'].apply(lambda y: list(filter(lambda x: x in ['delivery', 'pickup', 'restaurant_reservation'], y)))
    # Convert the list of display-address parts into a single string
    df['location.display_address'] = df['location.display_address'].apply(lambda x: ', '.join(map(str, x)))
    # Drop some unused columns - some columns have values specific to when the doc was indexed on Elasticsearch
    df = df.drop(['hours', 'phone', 'is_closed', 'timestamp', 'messaging.url', 'messaging.use_case_text'], axis=1)
    # Convert the transaction categories into indicator columns and join them with the
    # original dataframe (a toy example of this pattern follows this function)
    transaction_df = pd.get_dummies(df['transactions'].explode()).groupby(level=0).sum()
    df = df.join(transaction_df)
    # Only keep restaurants with price '$', '$$', '$$$', '$$$$', or 'Unknown'
    df = df[df.price.isin(['$', '$$', '$$$', '$$$$', 'Unknown'])]
    # For restaurants with a NaN state, extract the state from the display address
    df.loc[df['location.state'].isna(), 'location.state'] = df[df['location.state'].isna()]['location.display_address'].str.split(', ').apply(lambda x: x[-1].split(' ')[0])
    # Only keep restaurants within the given city
    df = df[df['location.city'] == city]
    # Only keep restaurants whose zip code consists of digits
    df = df[df['location.zip_code'].str.isdigit()]
    return df.reset_index(drop=True)
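To make the transactions step above concrete: exploding the list column, one-hot encoding it, and summing back by the original index turns each transaction type into its own indicator column. A toy sketch with made-up values (the toy frame is not part of the Yelp data):
toy = pd.DataFrame({'transactions': [['delivery', 'pickup'], ['delivery'], []]})
# explode() gives one row per list element (an empty list becomes a NaN row),
# get_dummies() one-hot encodes the values, and groupby(level=0).sum()
# collapses the result back to one row per original restaurant
toy_flags = pd.get_dummies(toy['transactions'].explode()).groupby(level=0).sum()
print(toy.join(toy_flags))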
df = data_cleaning(df,'San Francisco')
def show_image(name):
    restaurant = df[df.name == name].reset_index(drop=True)
    for i in range(len(restaurant)):
        print('Restaurant:', name, '\t Location:', restaurant.loc[i, 'location.display_address'])
        for pic in restaurant.loc[i, 'photos']:
            display(Image(pic))
show_image("Brenda's French Soul Food")
Restaurant: Brenda's French Soul Food Location: 652 Polk St, San Francisco, CA 94102
show_image("Tartine Bakery")
Restaurant: Tartine Bakery Location: 600 Guerrero St, San Francisco, CA 94110
df.sort_values(by='review_count',ascending=False).head()[['name','review_count','categories','rating','display_phone','price','location.display_address','delivery','pickup']]
|  | name | review_count | categories | rating | display_phone | price | location.display_address | delivery | pickup |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 2992 | Brenda's French Soul Food | 11898 | [Breakfast & Brunch, Southern, Cajun/Creole] | 4.0 | (415) 345-8100 | $$ | 652 Polk St, San Francisco, CA 94102 | 1 | 0 |
| 594 | Fog Harbor Fish House | 8714 | [Seafood, Wine Bars, Cocktail Bars] | 4.5 | (415) 969-2010 | $$ | 39 Pier, Ste a - 202, San Francisco, CA 94133 | 0 | 0 |
| 1421 | Tartine Bakery | 8664 | [Bakeries, Cafes, Desserts] | 4.0 | (415) 487-2600 | $$ | 600 Guerrero St, San Francisco, CA 94110 | 1 | 0 |
| 3251 | House of Prime Rib | 8306 | [American (Traditional), Steakhouses, Wine Bars] | 4.0 | (415) 885-4605 | $$$ | 1906 Van Ness Ave, San Francisco, CA 94109 | 0 | 0 |
| 2148 | San Tung | 7818 | [Chinese, Chicken Wings, Noodles] | 4.0 | (415) 242-0828 | $$ | 1031 Irving St, San Francisco, CA 94122 | 1 | 0 |
df.groupby('rating').name.agg(['count']).reset_index(drop=False)
|  | rating | count |
| --- | --- | --- |
| 0 | 1.0 | 64 |
| 1 | 1.5 | 29 |
| 2 | 2.0 | 99 |
| 3 | 2.5 | 146 |
| 4 | 3.0 | 316 |
| 5 | 3.5 | 826 |
| 6 | 4.0 | 1425 |
| 7 | 4.5 | 886 |
| 8 | 5.0 | 285 |
fig = px.bar(df.groupby('rating').name.agg(['count']).reset_index(drop=False)
, x="rating", y="count", color="rating", text_auto=True, title = "Total Restaurants by Yelp Ratings")
fig.update_layout(xaxis_title = "Rating",yaxis_title = "Total Restaurants")
fig.show()
# Get the restaurant counts for all categories
# Note: 1 restaurant could have multiple categories
cat = df['categories'].apply(pd.Series).stack().reset_index(drop=True).value_counts()
cat = cat.to_frame(name='count').reset_index().rename(columns={'index':'category'})
fig = go.Figure(data=[go.Pie(labels=cat.head(10).category, values=cat.head(10)['count'],hole = 0.3,
pull = [0.2,0,0,0,0,0,0,0,0,0])])
fig.update_layout(title = 'Top 10 categories')
fig.show()
fig = px.scatter_mapbox(df, lat="coordinates.latitude", lon="coordinates.longitude", color="location.zip_code",
zoom=11, hover_name = "name", hover_data = ['rating','review_count','transactions','categories'])
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(title = 'SF Restaurant Locations')
fig.show()
df.groupby(['location.zip_code','rating']).name.agg(['count']).reset_index(drop=False)
|  | location.zip_code | rating | count |
| --- | --- | --- | --- |
| 0 | 34016 | 4.5 | 1 |
| 1 | 90012 | 3.0 | 1 |
| 2 | 94013 | 3.5 | 1 |
| 3 | 94016 | 3.5 | 1 |
| 4 | 94016 | 5.0 | 2 |
| ... | ... | ... | ... |
| 220 | 94199 | 5.0 | 1 |
| 221 | 94214 | 5.0 | 1 |
| 222 | 94588 | 4.5 | 1 |
| 223 | 94619 | 2.5 | 1 |
| 224 | 94619 | 3.0 | 1 |

225 rows × 3 columns
fig = px.sunburst(df.groupby(['location.zip_code','rating']).name.agg(['count']).reset_index(drop=False), path=['location.zip_code','rating'], values='count', title = 'Total Restaurants by Zipcode and Rating')
fig.show()
delivery_pickup = df.groupby('location.zip_code').agg({'delivery':'sum','pickup':'sum'}).reset_index(drop=False)
delivery_pickup[delivery_pickup['location.zip_code'].isin(['94110','94103','94102','94109','94133'])]
|  | location.zip_code | delivery | pickup |
| --- | --- | --- | --- |
| 5 | 94102 | 193.0 | 135 |
| 6 | 94103 | 233.0 | 181 |
| 12 | 94109 | 231.0 | 171 |
| 13 | 94110 | 308.0 | 235 |
| 30 | 94133 | 186.0 | 138 |
fig = px.bar(delivery_pickup[delivery_pickup['location.zip_code'].isin(['94110','94103','94102','94109','94133'])],
x=['delivery','pickup'], y="location.zip_code", orientation='h', barmode='group',title='Total Restaurants by Delivery/Pickup Service for Top 5 Zipcodes With The Most Restaurants',
labels = {'location.zip_code':'Zipcode','variable':'Type','value':'Total Restaurants'})
fig.show()
rest_by_price = df.groupby('price').name.agg(['count']).reset_index(drop=False).sort_values(by='count',ascending=False)
fig = px.treemap(rest_by_price, path=['price'], values='count', title = 'Total Restaurants by Price')
fig.update_traces(root_color="lightgrey")
fig.show()
# This function returns an updated dataframe with the restaurant names and a final content column
# that combines the conditions we want the recommendation system to consider.
# The final content column is cleaned properly in the pre-processing step before building the recommendation.
def create_final_content(similar_conditions):
    data = df.copy()
    # New mapping for price
    data['price'] = data['price'].map({'$': '$', '$$': '2$', '$$$': '3$', '$$$$': '4$', 'Unknown': 'unknown'})
    # Update transactions to exclude special characters and spaces (only '_' in this case),
    # then join them together into one big string
    data['transactions'] = data['transactions'].apply(lambda x: ' '.join([i.replace('_', '') for i in x]))
    # Bucket review_count into groups using review-count bins (see the toy pd.cut example after this function)
    group_bin = [1, 100, 200, 400, 600, 800, 1000, 2000, np.inf]
    group_name = ['group1', 'group2', 'group3', 'group4', 'group5', 'group6', 'group7', 'group8']
    data['review_count'] = pd.cut(data.review_count, bins=group_bin, labels=group_name, include_lowest=True)
    # Bucket ratings into rating groups
    data['rating'] = pd.cut(data['rating'], bins=[1, 2, 3, 4, 5, np.inf], labels=['r1', 'r2', 'r3', 'r4', 'r5'], include_lowest=True, right=False)
    # Update categories to be all lower case with spaces removed,
    # then join them together into one big string
    data['categories'] = data['categories'].apply(lambda x: ' '.join([i.lower().replace(' ', '') for i in x]))
    # Create a new column by combining the values for categories, review count, rating, transactions, and price.
    # Handle both a single condition (string) and multiple conditions (list).
    data['final_content'] = data[similar_conditions].apply(lambda x: ' '.join(x.astype(str)), axis=1) if isinstance(similar_conditions, list) else data[similar_conditions]
    # Drop duplicate restaurants based on name and the content we want to compare the restaurants on
    data = data[['name', 'final_content']].drop_duplicates().reset_index(drop=True)
    return data
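As a quick sanity check of the binning above, a toy pd.cut call with made-up review counts (not taken from the dataset) shows how raw counts map onto the group labels used in final_content:
toy_counts = pd.Series([15, 250, 999, 5000])
group_bin = [1, 100, 200, 400, 600, 800, 1000, 2000, np.inf]
group_name = ['group1', 'group2', 'group3', 'group4', 'group5', 'group6', 'group7', 'group8']
# Expected labels: 15 -> group1, 250 -> group3, 999 -> group6, 5000 -> group8
print(pd.cut(toy_counts, bins=group_bin, labels=group_name, include_lowest=True))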
def build_Yelp_recommendation_system(similar_conditions, restaurant_name, top_n):
    # Create the updated df with the final content column based on the similar conditions
    data = create_final_content(similar_conditions)
    # Create the count sparse matrix for the content column, removing stop words in case there are
    # extra unnecessary strings (see the toy illustration after this function)
    cnt_matrix = CountVectorizer(stop_words='english').fit_transform(data['final_content'])
    # Calculate the cosine similarity scores
    cosine_sim = cosine_similarity(cnt_matrix)
    # Get the chosen restaurant's index based on its name
    restaurant_ind = data[data.name == restaurant_name].index[0]
    # Get the list of (restaurant index, cosine similarity score) tuples
    similar_list = list(enumerate(cosine_sim[restaurant_ind]))
    # Sort this list by cosine similarity score from high to low
    highest_similar_list = sorted(similar_list, key=lambda x: x[1], reverse=True)
    # In extremely good matches, a recommended restaurant can have the same cosine similarity score as the chosen restaurant (= 1).
    # Because the list is sorted by cosine similarity score rather than by restaurant name, the chosen restaurant
    # may not sit at the first index, so we exclude it by index and then take the top n recommended restaurants.
    # highest_similar_list.remove((restaurant_ind, 1.0))
    highest_similar_list = [item for item in highest_similar_list if item[0] != restaurant_ind]
    print('If the restaurant name shows up in the recommended list more than once, it means that while the restaurant has the same name, it has different conditions listed compared to the other location.')
    print('Below is a list of top', top_n, 'recommended restaurants based on', ', '.join(similar_conditions) if isinstance(similar_conditions, list) else similar_conditions, 'compared to', restaurant_name + ':')
    for i in highest_similar_list[:top_n]:
        print(data.name.loc[i[0]])
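Before running it on the full dataset, here is a quick toy illustration of how CountVectorizer and cosine_similarity score the overlap between final_content strings. The toy_content strings are hypothetical, not real restaurants:
toy_content = ['chinese noodles 2$ r4 group7',
               'chinese dumplings 2$ r4 group7',
               'bakeries desserts 2$ r4 group2']
toy_matrix = CountVectorizer().fit_transform(toy_content)
# Entry (i, j) is the cosine similarity between toy strings i and j;
# the first two share more tokens, so they score higher with each other than with the third
print(cosine_similarity(toy_matrix).round(2))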
build_Yelp_recommendation_system(['price','categories','rating','review_count'], 'San Tung',10)
If the restaurant name shows up in the recommended list more than once, it means that while the restaurant has the same name, it has different conditions listed compared to the other location.
Below is a list of top 10 recommended restaurants based on price, categories, rating, review_count compared to San Tung:
Mensho Tokyo SF
Marugame Udon
Nan King China Bistro
Yuanbao Jiaozi
Sichuan Tasty Restaurant
Guilin Rice Noodles House
Izakaya Sozai
Thanh Long
Baan Yaai
Yummy Yummy
build_Yelp_recommendation_system('categories','Arsicault',10)
If the restaurant name shows up in the recommended list more than once, it means that while the restaurant has the same name, it has different conditions listed compared to the other location.
Below is a list of top 10 recommended restaurants based on categories compared to Arsicault:
Devil’s Teeth Baking Company
Eterna Primavera Bakery
Devil's Teeth Baking Company
Breadbelly
Tartine Inner Sunset
Vive La Tarte
Universal Bakery
Heavenly Bakery
Mama's On Washington Square
Home Plate