import numpy as np
import pandas as pd
import json
from elasticsearch import Elasticsearch
from jproperties import Properties
import matplotlib.pyplot as plt
from IPython.display import display, Image
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def create_df_from_ES_index(yelp_ES_config_path, ES_index_name):
    # Connect to Elasticsearch using the Cloud_ID and password stored in a properties file
    configs = Properties()
    with open(yelp_ES_config_path, 'rb') as config_file:
        configs.load(config_file)
    client = Elasticsearch(cloud_id=configs.get('Cloud_ID').data, basic_auth=("elastic", configs.get('Elastic_pw').data))
    # client.info()
    # Get the first batch of documents from the search
    resp = client.search(index=ES_index_name, query={"match_all": {}}, size=10000)
    hits_meta = resp['hits']['hits']
    # Get the remaining documents; this handles the case where a city/state has more than
    # 10,000 restaurants (see the scroll-based sketch after this function for an alternative)
    while resp['hits']['total']['relation'] == 'gte':
        max_ts = max([i['_source']['timestamp'] for i in resp['hits']['hits']])
        resp = client.search(index=ES_index_name, query={"range": {"timestamp": {"gte": max_ts}}}, size=10000)
        hits_meta.extend(resp['hits']['hits'])
    # Create a dataframe with the source info for every restaurant returned
    # (DataFrame.append was removed in pandas 2.0, so build the frame with json_normalize in one pass)
    df = pd.json_normalize([hit['_source'] for hit in hits_meta])
    return df.reset_index(drop=True)
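As an alternative to the timestamp-keyed pagination above, the scroll helper in the elasticsearch Python client can stream every document past the 10,000-hit search window. A minimal sketch, assuming helpers.scan is available in the installed elasticsearch-py version and reusing a client built the same way as above; the create_df_with_scan name is only for illustration:
from elasticsearch.helpers import scan

def create_df_with_scan(client, ES_index_name):
    # scan() wraps the scroll API and yields every matching hit,
    # so no manual timestamp-based pagination is needed
    hits = scan(client, index=ES_index_name, query={"query": {"match_all": {}}}, size=1000)
    # Flatten each document's _source into dataframe columns
    return pd.json_normalize([hit['_source'] for hit in hits])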
# Create dataframe after connecting to Elasticsearch given the index name
df = create_df_from_ES_index('/Users/dannie/Documents/Projects/restaurant-recommendation-system/Yelp_ES_config.properties','yelp-fusion-restaurants-sf')
def data_cleaning(df, city):
    # Prices for '$$$$' restaurants do not display properly in the dataframe => re-assign
    df.loc[df['price'] == '$$$$', 'price'] = '$$$$'
    # Re-assign restaurants with NA price to 'Unknown'
    df.loc[df['price'].isna(), 'price'] = 'Unknown'
    # Remove restaurants with empty categories
    df = df[df['categories'].apply(len) != 0]
    # Convert the list of alias/title dictionaries into a flat list of the restaurant's category titles
    df['categories'] = df['categories'].apply(lambda x: pd.DataFrame.from_dict(x)['title'].to_list())
    # Remove mismatched transactions, keeping only delivery, pickup, or restaurant reservation
    df['transactions'] = df['transactions'].apply(lambda y: list(filter(lambda x: x in ['delivery', 'pickup', 'restaurant_reservation'], y)))
    # Convert the list of display-address parts into a single string
    df['location.display_address'] = df['location.display_address'].apply(lambda x: ', '.join(map(str, x)))
    # Drop some unused columns - some columns have values specific to when the doc was indexed on Elasticsearch
    df = df.drop(['hours', 'phone', 'is_closed', 'timestamp', 'messaging.url', 'messaging.use_case_text'], axis=1)
    # Convert the transaction categories into indicator columns and join them with the
    # original dataframe (a toy example of this pattern follows this function)
    transaction_df = pd.get_dummies(df['transactions'].explode()).groupby(level=0).sum()
    df = df.join(transaction_df)
    # Only keep restaurants with price '$', '$$', '$$$', '$$$$', or 'Unknown'
    df = df[df.price.isin(['$', '$$', '$$$', '$$$$', 'Unknown'])]
    # For restaurants with a NaN state, extract the state from the display address
    df.loc[df['location.state'].isna(), 'location.state'] = df[df['location.state'].isna()]['location.display_address'].str.split(', ').apply(lambda x: x[-1].split(' ')[0])
    # Only keep restaurants within the given city
    df = df[df['location.city'] == city]
    # Only keep restaurants whose zip code consists of digits
    df = df[df['location.zip_code'].str.isdigit()]
    return df.reset_index(drop=True)
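To make the transactions step above concrete: exploding the list column, one-hot encoding it, and summing back by the original index turns each transaction type into its own indicator column. A toy sketch with made-up values (the toy frame is not part of the Yelp data):
toy = pd.DataFrame({'transactions': [['delivery', 'pickup'], ['delivery'], []]})
# explode() gives one row per list element (an empty list becomes a NaN row),
# get_dummies() one-hot encodes the values, and groupby(level=0).sum()
# collapses the result back to one row per original restaurant
toy_flags = pd.get_dummies(toy['transactions'].explode()).groupby(level=0).sum()
print(toy.join(toy_flags))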
df = data_cleaning(df,'San Francisco')
def show_image(name):
    restaurant = df[df.name == name].reset_index(drop=True)
    for i in range(len(restaurant)):
        print('Restaurant:', name, '\t Location:', restaurant.loc[i, 'location.display_address'])
        for pic in restaurant.loc[i, 'photos']:
            display(Image(pic))
show_image("Brenda's French Soul Food")
Restaurant: Brenda's French Soul Food Location: 652 Polk St, San Francisco, CA 94102
show_image("Tartine Bakery")
Restaurant: Tartine Bakery Location: 600 Guerrero St, San Francisco, CA 94110
df.sort_values(by='review_count',ascending=False).head()[['name','review_count','categories','rating','display_phone','price','location.display_address','delivery','pickup']]
|  | name | review_count | categories | rating | display_phone | price | location.display_address | delivery | pickup |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 2992 | Brenda's French Soul Food | 11898 | [Breakfast & Brunch, Southern, Cajun/Creole] | 4.0 | (415) 345-8100 | $$ | 652 Polk St, San Francisco, CA 94102 | 1 | 0 |
| 594 | Fog Harbor Fish House | 8714 | [Seafood, Wine Bars, Cocktail Bars] | 4.5 | (415) 969-2010 | $$ | 39 Pier, Ste a - 202, San Francisco, CA 94133 | 0 | 0 |
| 1421 | Tartine Bakery | 8664 | [Bakeries, Cafes, Desserts] | 4.0 | (415) 487-2600 | $$ | 600 Guerrero St, San Francisco, CA 94110 | 1 | 0 |
| 3251 | House of Prime Rib | 8306 | [American (Traditional), Steakhouses, Wine Bars] | 4.0 | (415) 885-4605 | $$$ | 1906 Van Ness Ave, San Francisco, CA 94109 | 0 | 0 |
| 2148 | San Tung | 7818 | [Chinese, Chicken Wings, Noodles] | 4.0 | (415) 242-0828 | $$ | 1031 Irving St, San Francisco, CA 94122 | 1 | 0 |
df.groupby('rating').name.agg(['count']).reset_index(drop=False)
|  | rating | count |
| --- | --- | --- |
| 0 | 1.0 | 64 |
| 1 | 1.5 | 29 |
| 2 | 2.0 | 99 |
| 3 | 2.5 | 146 |
| 4 | 3.0 | 316 |
| 5 | 3.5 | 826 |
| 6 | 4.0 | 1425 |
| 7 | 4.5 | 886 |
| 8 | 5.0 | 285 |
fig = px.bar(df.groupby('rating').name.agg(['count']).reset_index(drop=False)
, x="rating", y="count", color="rating", text_auto=True, title = "Total Restaurants by Yelp Ratings")
fig.update_layout(xaxis_title = "Rating",yaxis_title = "Total Restaurants")
fig.show()
# Get the restaurant counts for all categories
# Note: 1 restaurant could have multiple categories
cat = df['categories'].apply(pd.Series).stack().reset_index(drop=True).value_counts()
cat = cat.to_frame(name='count').reset_index().rename(columns={'index':'category'})
fig = go.Figure(data=[go.Pie(labels=cat.head(10).category, values=cat.head(10)['count'],hole = 0.3,
pull = [0.2,0,0,0,0,0,0,0,0,0])])
fig.update_layout(title = 'Top 10 categories')
fig.show()
fig = px.scatter_mapbox(df, lat="coordinates.latitude", lon="coordinates.longitude", color="location.zip_code",
zoom=11, hover_name = "name", hover_data = ['rating','review_count','transactions','categories'])
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(title = 'SF Restaurant Locations')
fig.show()
df.groupby(['location.zip_code','rating']).name.agg(['count']).reset_index(drop=False)
|  | location.zip_code | rating | count |
| --- | --- | --- | --- |
| 0 | 34016 | 4.5 | 1 |
| 1 | 90012 | 3.0 | 1 |
| 2 | 94013 | 3.5 | 1 |
| 3 | 94016 | 3.5 | 1 |
| 4 | 94016 | 5.0 | 2 |
| ... | ... | ... | ... |
| 220 | 94199 | 5.0 | 1 |
| 221 | 94214 | 5.0 | 1 |
| 222 | 94588 | 4.5 | 1 |
| 223 | 94619 | 2.5 | 1 |
| 224 | 94619 | 3.0 | 1 |

225 rows × 3 columns
fig = px.sunburst(df.groupby(['location.zip_code','rating']).name.agg(['count']).reset_index(drop=False), path=['location.zip_code','rating'], values='count', title = 'Total Restaurants by Zipcode and Rating')
fig.show()
delivery_pickup = df.groupby('location.zip_code').agg({'delivery':'sum','pickup':'sum'}).reset_index(drop=False)
delivery_pickup[delivery_pickup['location.zip_code'].isin(['94110','94103','94102','94109','94133'])]
|  | location.zip_code | delivery | pickup |
| --- | --- | --- | --- |
| 5 | 94102 | 193.0 | 135 |
| 6 | 94103 | 233.0 | 181 |
| 12 | 94109 | 231.0 | 171 |
| 13 | 94110 | 308.0 | 235 |
| 30 | 94133 | 186.0 | 138 |
fig = px.bar(delivery_pickup[delivery_pickup['location.zip_code'].isin(['94110','94103','94102','94109','94133'])],
x=['delivery','pickup'], y="location.zip_code", orientation='h', barmode='group',title='Total Restaurants by Delivery/Pickup Service for Top 5 Zipcodes With The Most Restaurants',
labels = {'location.zip_code':'Zipcode','variable':'Type','value':'Total Restaurants'})
fig.show()
rest_by_price = df.groupby('price').name.agg(['count']).reset_index(drop=False).sort_values(by='count',ascending=False)
fig = px.treemap(rest_by_price, path=['price'], values='count', title = 'Total Restaurants by Price')
fig.update_traces(root_color="lightgrey")
fig.show()
# This function returns an updated dataframe with the restaurant names and a final content column
# that combines the conditions we want the recommendation system to consider.
# The final content column is cleaned properly in the pre-processing step before building the recommendation.
def create_final_content(similar_conditions):
    data = df.copy()
    # New mapping for price
    data['price'] = data['price'].map({'$': '$', '$$': '2$', '$$$': '3$', '$$$$': '4$', 'Unknown': 'unknown'})
    # Update transactions to exclude special characters and spaces (only '_' in this case),
    # then join them together into one big string
    data['transactions'] = data['transactions'].apply(lambda x: ' '.join([i.replace('_', '') for i in x]))
    # Bucket review_count into groups using review-count bins (see the toy pd.cut example after this function)
    group_bin = [1, 100, 200, 400, 600, 800, 1000, 2000, np.inf]
    group_name = ['group1', 'group2', 'group3', 'group4', 'group5', 'group6', 'group7', 'group8']
    data['review_count'] = pd.cut(data.review_count, bins=group_bin, labels=group_name, include_lowest=True)
    # Bucket ratings into rating groups
    data['rating'] = pd.cut(data['rating'], bins=[1, 2, 3, 4, 5, np.inf], labels=['r1', 'r2', 'r3', 'r4', 'r5'], include_lowest=True, right=False)
    # Update categories to be all lower case with spaces removed,
    # then join them together into one big string
    data['categories'] = data['categories'].apply(lambda x: ' '.join([i.lower().replace(' ', '') for i in x]))
    # Create a new column by combining the values for categories, review count, rating, transactions, and price.
    # Handle both a single condition (string) and multiple conditions (list).
    data['final_content'] = data[similar_conditions].apply(lambda x: ' '.join(x.astype(str)), axis=1) if isinstance(similar_conditions, list) else data[similar_conditions]
    # Drop duplicate restaurants based on name and the content we want to compare the restaurants on
    data = data[['name', 'final_content']].drop_duplicates().reset_index(drop=True)
    return data
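As a quick sanity check of the binning above, a toy pd.cut call with made-up review counts (not taken from the dataset) shows how raw counts map onto the group labels used in final_content:
toy_counts = pd.Series([15, 250, 999, 5000])
group_bin = [1, 100, 200, 400, 600, 800, 1000, 2000, np.inf]
group_name = ['group1', 'group2', 'group3', 'group4', 'group5', 'group6', 'group7', 'group8']
# Expected labels: 15 -> group1, 250 -> group3, 999 -> group6, 5000 -> group8
print(pd.cut(toy_counts, bins=group_bin, labels=group_name, include_lowest=True))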
def build_Yelp_recommendation_system(similar_conditions, restaurant_name, top_n):
    # Create the updated df with the final content column based on the similar conditions
    data = create_final_content(similar_conditions)
    # Create the count sparse matrix for the content column, removing stop words in case there are
    # extra unnecessary strings (see the toy illustration after this function)
    cnt_matrix = CountVectorizer(stop_words='english').fit_transform(data['final_content'])
    # Calculate the cosine similarity scores
    cosine_sim = cosine_similarity(cnt_matrix)
    # Get the chosen restaurant's index based on its name
    restaurant_ind = data[data.name == restaurant_name].index[0]
    # Get the list of (restaurant index, cosine similarity score) tuples
    similar_list = list(enumerate(cosine_sim[restaurant_ind]))
    # Sort this list by cosine similarity score from high to low
    highest_similar_list = sorted(similar_list, key=lambda x: x[1], reverse=True)
    # In extremely good matches, a recommended restaurant can have the same cosine similarity score as the chosen restaurant (= 1).
    # Because the list is sorted by cosine similarity score rather than by restaurant name, the chosen restaurant
    # may not sit at the first index, so we exclude it by index and then take the top n recommended restaurants.
    # highest_similar_list.remove((restaurant_ind, 1.0))
    highest_similar_list = [item for item in highest_similar_list if item[0] != restaurant_ind]
    print('If the restaurant name shows up in the recommended list more than once, it means that while the restaurant has the same name, it has different conditions listed compared to the other location.')
    print('Below is a list of top', top_n, 'recommended restaurants based on', ', '.join(similar_conditions) if isinstance(similar_conditions, list) else similar_conditions, 'compared to', restaurant_name + ':')
    for i in highest_similar_list[:top_n]:
        print(data.name.loc[i[0]])
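Before running it on the full dataset, here is a quick toy illustration of how CountVectorizer and cosine_similarity score the overlap between final_content strings. The toy_content strings are hypothetical, not real restaurants:
toy_content = ['chinese noodles 2$ r4 group7',
               'chinese dumplings 2$ r4 group7',
               'bakeries desserts 2$ r4 group2']
toy_matrix = CountVectorizer().fit_transform(toy_content)
# Entry (i, j) is the cosine similarity between toy strings i and j;
# the first two share more tokens, so they score higher with each other than with the third
print(cosine_similarity(toy_matrix).round(2))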
build_Yelp_recommendation_system(['price','categories','rating','review_count'], 'San Tung',10)
If the restaurant name shows up in the recommended list more than once, it means that while the restaurant has the same name, it has different conditions listed compared to the other location.
Below is a list of top 10 recommended restaurants based on price, categories, rating, review_count compared to San Tung:
Mensho Tokyo SF
Marugame Udon
Nan King China Bistro
Yuanbao Jiaozi
Sichuan Tasty Restaurant
Guilin Rice Noodles House
Izakaya Sozai
Thanh Long
Baan Yaai
Yummy Yummy
build_Yelp_recommendation_system('categories','Arsicault',10)
If the restaurant name shows up in the recommended list more than once, it means that while the restaurant has the same name, it has different conditions listed compared to the other location.
Below is a list of top 10 recommended restaurants based on categories compared to Arsicault:
Devil’s Teeth Baking Company
Eterna Primavera Bakery
Devil's Teeth Baking Company
Breadbelly
Tartine Inner Sunset
Vive La Tarte
Universal Bakery
Heavenly Bakery
Mama's On Washington Square
Home Plate