Table of Contents¶

  • Required Packages
  • Conducting Initial Data Investigation
    • Findings
  • Conducting Initial Topic Modelling
    • Findings
  • Performing Further Data Investigation
    • Findings
  • Conducting Emotion Analysis
    • Findings
  • Using a Large Language Model from Hugging Face (Phi)
    • Findings
  • Using Gensim for Topic Modelling
    • Findings

Required Packages¶

In [ ]:
!pip install bertopic gensim pyLDAvis ipykernel
In [2]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from bertopic import BERTopic
import torch
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
In [3]:
nltk.download("stopwords")
nltk.download("punkt_tab")
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Out[3]:
True

Conducting Initial Data Investigation¶

In [4]:
googleReview = pd.read_excel("Google_12_months.xlsx")
trustPilot = pd.read_excel("Trustpilot_12_months.xlsx")
In [5]:
googleReview.rename(columns= {"Club's Name": "Location Name", "Overall Score": "Review Score"}, inplace = True)
trustPilot.rename(columns= {"Review Content": "Comment", "Review Stars": "Review Score"}, inplace = True)
In [6]:
def data_preprocessing(data, col_name):

  data = data.copy()
  print(data.shape)

  # English stopwords to drop during tokenization
  stop_words = set(stopwords.words("english"))

  # Drop rows with no comment at all
  data = data.dropna(subset = [col_name])

  # Language-detection model, used to keep English reviews only
  classifier = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection", truncation=True)

  # Strip digits before tokenizing
  data[col_name] = data[col_name].str.replace(r'\d+', '', regex = True)

  def process_text(text):

    lang_result = classifier(text)

    # Lowercase, tokenize, and drop stopwords and non-alphabetic tokens;
    # non-English comments are replaced with empty strings
    if lang_result[0]["label"] == "en":

      return " ".join([word for word in word_tokenize(text.lower()) if word not in stop_words and word.isalpha()])
    return ""

  data[col_name] = data[col_name].apply(process_text)
  return data
In [7]:
col_name = "Comment"
print("Google Reviews")
clean_google = data_preprocessing(googleReview, col_name)
print("Trustpilot Reviews")
clean_pilot = data_preprocessing(trustPilot, col_name)
Google Reviews
(23250, 7)
Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Trustpilot Reviews
(16673, 15)
Device set to use cuda:0
In [8]:
def wordcloud(cloud):

  # Exclude domain words that dominate every review
  custom_stopwords = {"gym", "puregym"}
  stopwords = STOPWORDS.union(custom_stopwords)

  # `cloud` maps a chart title to a list of tokens
  for title, i in  cloud.items():

    wordcloud = WordCloud(stopwords= stopwords, width=800, height=400, background_color='white').generate(" ".join(i))

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"The wordcloud for: {title}")
    plt.show()
In [9]:
def analysis(data):

  neg_reviews = {}

  for name, df in data.items():

    # Flatten all comments into one token list for frequency counts
    all_words = [word for comment in df['Comment'] for word in word_tokenize(comment)]

    print(f"{name} have {df['Location Name'].str.lower().nunique()} unique location")
    print(f"{name} frequency distribution is {FreqDist(all_words)}")
    print(f"{name} has this shape {df.shape}")

    common_words = FreqDist(all_words).most_common(10)

    # A review is treated as negative when its score is below 3
    neg_reviews[name.split()[0]] = df[df["Review Score"] < 3]
    neg_words = [word for comment in neg_reviews[name.split()[0]]['Comment'] for word in word_tokenize(comment)]

    print(f"{name} frequency distribution of negative words is {FreqDist(neg_words)}")
    print(f"{name} negative reviews has this shape {neg_reviews[name.split()[0]].shape}")

    words, counts = zip(*common_words)

    plt.figure(figsize=(10, 5))
    plt.bar(words, counts, color='skyblue')
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.title(f"Top 10 Most Common Words")
    plt.show()

    # Word clouds for the full set and for the negative subset
    cloud = {f"All Reviews for {name.split()[0]}": all_words, f"Negative Reviews for {name.split()[0]}":neg_words}

    wordcloud(cloud)

  return neg_reviews
In [10]:
datasets = {"google review": clean_google, "trustpilot reviews": clean_pilot}
neg_dataset = analysis(datasets)
google review have 512 unique location
google review frequency distribution is <FreqDist with 13492 samples and 241042 outcomes>
google review has this shape (13898, 7)
google review frequency distribution of negative words is <FreqDist with 8414 samples and 81163 outcomes>
google review negative reviews has this shape (2785, 7)
trustpilot reviews have 374 unique location
trustpilot reviews frequency distribution is <FreqDist with 12859 samples and 277004 outcomes>
trustpilot reviews has this shape (16673, 15)
trustpilot reviews frequency distribution of negative words is <FreqDist with 8410 samples and 95444 outcomes>
trustpilot reviews negative reviews has this shape (3543, 15)
In [11]:
common_locations = len(set(clean_google["Location Name"].str.lower()).intersection(set(clean_pilot["Location Name"].str.lower())))
print(f'Number of common locations: {common_locations}')
Number of common locations: 312

Findings¶

Dataset Before Cleaning¶

  • The Google dataset contains 23,250 data points with 7 features, while the Trustpilot dataset contains 16,673 data points with 15 features.

Dataset Cleaning¶

  • Removed rows where the comments column is empty; comments detected as non-English (via papluca/xlm-roberta-base-language-detection) were replaced with empty strings.
  • Text processing steps (a sketch follows below):
    • Stopwords removed.
    • Non-alphabetic tokens removed.
    • Text lowercased and tokenized.
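
For reference, a minimal sketch of the same cleaning steps on a single made-up comment (the language-detection step is omitted for brevity, and the review text is invented for illustration):

In [ ]:
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

nltk.download("stopwords", quiet=True)
nltk.download("punkt_tab", quiet=True)

stop_words = set(stopwords.words("english"))

def clean_comment(text):
    # Strip digits, lowercase, tokenize, then drop stopwords and
    # non-alphabetic tokens, mirroring data_preprocessing above
    text = re.sub(r"\d+", "", text)
    return " ".join(
        word for word in word_tokenize(text.lower())
        if word not in stop_words and word.isalpha()
    )

print(clean_comment("The gym was great, but 2 of the showers were broken!"))
# -> gym great showers broken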

Initial Analysis After Cleaning¶

PureGym Reviews on Google¶

  • The cleaned Google dataset now has 13,898 data points.
  • Comments feature:
    • 13,492 unique words.
    • 241,042 total words.
  • Location Name feature:
    • 512 unique locations.
  • Top 10 words indicate a positive sentiment, featuring words like clean and great.
  • Word cloud analysis also suggests a positive sentiment with prominent words like nice and love.

Negative Reviews (Score < 3)¶

  • 2,785 data points classified as negative.
  • 8,414 unique words, 81,163 total words.
  • Word cloud reveals more negative words such as dirty and broken.

PureGym Reviews on Trustpilot¶

  • The cleaned Trustpilot dataset still has 16,673 data points.
  • Comments feature:
    • 12,859 unique words.
    • 277,004 total words.
  • Location Name feature:
    • 374 unique locations.
  • Top 10 words suggest a positive sentiment similar to Google reviews, with words like clean and great.
  • Word cloud is less conclusive, showing words like great and good, making it harder to determine an overall sentiment.

Negative Reviews (Score < 3)¶

  • 3,543 data points classified as negative.
  • 8,410 unique words, 95,444 total words.
  • Word cloud highlights specific negative topics, such as staff and showers, which may contribute to the dissatisfaction.

Common Locations¶

  • Number of common locations between Google and Trustpilot: 312

Conducting Initial Topic Modelling¶

In [12]:
google_neg = None
trustpilot_neg = None


for name, df in neg_dataset.items():
  if name == "google":
    google_neg = df
  else:
    trustpilot_neg = df
In [13]:
common_google = google_neg[google_neg["Location Name"].isin(trustpilot_neg["Location Name"])][["Comment","Location Name"]]
common_trustpilot = trustpilot_neg[trustpilot_neg["Location Name"].isin(common_google["Location Name"])][["Comment", "Location Name"]]
merged_data = pd.concat([common_google, common_trustpilot], ignore_index=True)
In [14]:
merged_data.shape
Out[14]:
(3888, 2)
In [37]:
def bert_modelling(data):

  data = data.copy()

  merged_neg_clean = data["Comment"].tolist()

  # Fit BERTopic on the documents, then assign each one a topic label
  model = BERTopic(verbose=True)
  model.fit(merged_neg_clean)
  topics, probabilities = model.transform(merged_neg_clean)

  data["topic"] = topics

  return model, data, topics, probabilities
In [38]:
model, topics_data, topics, probabilities = bert_modelling(merged_data)
2025-04-01 03:01:29,078 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-01 03:01:31,289 - BERTopic - Embedding - Completed ✓
2025-04-01 03:01:31,290 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-01 03:01:54,395 - BERTopic - Dimensionality - Completed ✓
2025-04-01 03:01:54,396 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-01 03:01:54,540 - BERTopic - Cluster - Completed ✓
2025-04-01 03:01:54,544 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-01 03:01:54,694 - BERTopic - Representation - Completed ✓
2025-04-01 03:01:55,924 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-01 03:01:55,936 - BERTopic - Dimensionality - Completed ✓
2025-04-01 03:01:55,937 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-01 03:01:56,069 - BERTopic - Cluster - Completed ✓
In [40]:
def bert_visuals(bert_model):

  print(f"Total number of topics: {len(bert_model.get_topics())}")
  topic_freq = bert_model.get_topic_freq()
  print(f"Outliers:\n {topic_freq[topic_freq['Topic'] == -1]} \n")
  print(f"Top topics along with their document frequencies: \n {topic_freq[topic_freq['Topic'] != -1].head(10)} \n")
  top_2_topics = topic_freq[topic_freq['Topic'] != -1].head(2)["Topic"].tolist()
  for tp in top_2_topics:
    print(f"Topic {tp} has the following top words: \n {bert_model.get_topic(tp)} \n")

  fig1 = bert_model.visualize_topics()
  fig1.show()

  fig2 = bert_model.visualize_barchart()
  fig2.show()

  fig3 = bert_model.visualize_heatmap()
  fig3.show()
In [41]:
bert_visuals(model)
Total number of topics: 50
Outliers:
    Topic  Count
4     -1   1628 

Top topics along with their document frequencies: 
     Topic  Count
2       0    195
11      1    150
10      2    142
3       3    137
12      4    123
20      5    115
27      6     83
7       7     71
8       8     58
41      9     57 

Topic 0 has the following top words: 
 [('air', 0.06535853622220478), ('conditioning', 0.039109409267406005), ('hot', 0.0341153751577972), ('aircon', 0.030904603329895447), ('con', 0.02539572030351039), ('ac', 0.02335473164547456), ('summer', 0.022840948850730125), ('heat', 0.022368245534383064), ('working', 0.018771570123393193), ('temperature', 0.017792814503912378)] 

Topic 1 has the following top words: 
 [('rude', 0.021586991254173425), ('manager', 0.02069975537194293), ('staff', 0.01869429753354046), ('member', 0.01668603498125788), ('gym', 0.01588568912675759), ('women', 0.012400406597117608), ('personal', 0.011682768418990724), ('reviews', 0.011341051914656263), ('men', 0.010887589494607062), ('told', 0.01059952186619874)] 

In [19]:
top_10_topic_Location = topics_data[(topics_data["topic"] > -1) & (topics_data["topic"] < 10)]
top_10_topic_Location[["Location Name", "topic"]].value_counts().head(10)
Out[19]:
                               count
Location Name           topic
Leicester Walnut Street 0         28
Nottingham Colwick      5         16
Billericay              5         13
London Stratford        2         12
Aylesbury               1         10
Walsall Crown Wharf     5         10
Reading Calcot          0          9
Norwich Riverside       1          8
Burnham                 2          8
Paisley                 0          7

Findings¶

Pre-processing before topic modelling¶

  • The PureGym review datasets from Google and Trustpilot were merged, keeping only negative reviews (score < 3) from locations that appear in both datasets. The Comment feature of the merged data was used for topic modelling.
  • The merged result contains only 3,888 data points.

Topic Modelling Results (Bertopic)¶

  • BERTopic classified the merged dataset into 50 topics, including the outlier topic.
  • 1,628 data points have been classed as outliers, which is a significant number given we only have 3,888 data points (one way to reassign them is sketched after this list).
  • Looking at words in the top two topics (Topic 0: 195 documents, Topic 1: 150), air conditioning and rude staff appear to be the main issues.
  • The intertopic distance map shows 8 distinct topics, with the remaining topics clustered together as part of these 8 topics.
  • Looking at the words from the top 8 topics, the issues become clear. From these, we can deduce that they concern:
    • Air conditioning at the gym.
    • Rude staff.
    • Problem with classes.
    • Access issues (pin code).
    • Dirty toilet.
    • Parking issues (fines).
    • Dirty gym.
    • Loud music.
  • The similarity matrix between topics indicates that BERTopic has done a good job of separating the topics, given how light the colors are.
  • Leicester Walnut Street leads Topic 0 (air conditioning) with 28 occurrences, followed by Reading Calcot (9) and Paisley (7).
  • Aylesbury leads Topic 1 with 10 occurrences, which relates to rude staff/staff issues.
  • Nottingham Colwick, Billericay, and Walsall Crown Wharf record 16, 13, and 10 occurrences of Topic 5, which appears to be related to parking issues.
  • London Stratford leads Topic 2 with 12 occurrences.
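
Given that 1,628 of 3,888 documents were labelled as outliers, one option is to fold them back into their nearest topics. A minimal sketch, assuming a BERTopic version that provides reduce_outliers (0.13+) and the model, topics, and merged_data objects from the cells above:

In [ ]:
# Reassign outlier documents (topic -1) to their nearest topic
docs = merged_data["Comment"].tolist()

# "c-tf-idf" matches each outlier against the topics' c-TF-IDF vectors;
# "embeddings" and "distributions" are alternative strategies
new_topics = model.reduce_outliers(docs, topics, strategy="c-tf-idf")

# Recompute the topic representations with the reassigned labels
model.update_topics(docs, topics=new_topics)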

Performing Further Data Investigation¶

In [20]:
print(f"google top 20 locations with the most negative reviews {common_google['Location Name'].value_counts().head(20)}")
print("\n")
print(f"Trustpilot top 20 locations with the most negative reviews {common_trustpilot['Location Name'].value_counts().head(20)}")
google top 20 locations with the most negative reviews Location Name
London Stratford            59
London Canary Wharf         26
London Enfield              25
London Swiss Cottage        24
Birmingham City Centre      21
London Leytonstone          21
New Barnet                  20
Wakefield                   19
Bradford Thornbury          19
Walsall Crown Wharf         18
London Hoxton               18
London Seven Sisters        18
London Hayes                17
Manchester Exchange Quay    17
Sutton Times Square         16
London Bermondsey           16
Nottingham Colwick          16
London Piccadilly           15
Leeds City Centre North     15
London Muswell Hill         15
Name: count, dtype: int64


Trustpilot top 20 locations with the most negative reviews Location Name
Leicester Walnut Street      50
London Enfield               23
London Stratford             22
Burnham                      20
London Bermondsey            18
York                         16
Maidenhead                   16
London Seven Sisters         16
London Finchley              16
London Hayes                 16
London Swiss Cottage         15
London Hammersmith Palais    15
Northwich                    15
London Bromley               15
Dudley Tipton                14
Watford Waterfields          14
Basildon                     14
Bradford Thornbury           14
Telford                      14
Birmingham City Centre       14
Name: count, dtype: int64
In [21]:
google_top_20 = common_google['Location Name'].value_counts().head(20).reset_index()
google_top_20.columns = ['Location Name', 'Google Negative Reviews']

trustpilot_top_20 = common_trustpilot['Location Name'].value_counts().head(20).reset_index()
trustpilot_top_20.columns = ['Location Name', 'Trustpilot Negative Reviews']

merged_top_20 = pd.merge(google_top_20, trustpilot_top_20, on="Location Name", how="inner")

print("Common Locations with Negative Reviews from Both Google and Trustpilot:")
merged_top_20
Common Locations with Negative Reviews from Both Google and Trustpilot:
Out[21]:
            Location Name  Google Negative Reviews  Trustpilot Negative Reviews
0        London Stratford                       59                           22
1          London Enfield                       25                           23
2    London Swiss Cottage                       24                           15
3  Birmingham City Centre                       21                           14
4      Bradford Thornbury                       19                           14
5    London Seven Sisters                       18                           16
6            London Hayes                       17                           16
7       London Bermondsey                       16                           18
In [22]:
google_neg_loc = google_neg["Location Name"].value_counts().reset_index()
pilot_neg_loc = trustpilot_neg["Location Name"].value_counts().reset_index()
In [23]:
merged_count = pd.merge(google_neg_loc, pilot_neg_loc, on='Location Name', how='outer')
merged_count["count_x"] = merged_count["count_x"].fillna(0)
merged_count["count_y"] = merged_count["count_y"].fillna(0)

merged_count["Total by loaction"] = merged_count["count_x"] + merged_count["count_y"]
merged_count = merged_count.sort_values(by='Total by loaction', ascending= False)
In [24]:
merged_count
Out[24]:
                            Location Name  count_x  count_y  Total by location
362                      London Stratford     59.0     22.0               81.0
289               Leicester Walnut Street     11.0     50.0               61.0
326                        London Enfield     25.0     23.0               48.0
0                                     345      0.0     45.0               45.0
364                  London Swiss Cottage     24.0     15.0               39.0
..                                    ...      ...      ...                ...
278                     La Chaux de Fonds      1.0      0.0                1.0
59              247 - Esbjerg, Randersvej      1.0      0.0                1.0
61                   249 - Ikast, Strøget      1.0      0.0                1.0
64    255 - Kalundborg, Sct. Jørgensbjerg      1.0      0.0                1.0
12                 124 - Kbh. NV., Lygten      1.0      0.0                1.0

539 rows × 4 columns

In [81]:
google_top_30 = google_neg[google_neg["Location Name"].isin(merged_count["Location Name"].head(30))][["Comment", "Location Name"]]
pilot_top_30 = trustpilot_neg[trustpilot_neg["Location Name"].isin(merged_count["Location Name"].head(30))][["Comment", "Location Name"]]

top_30_wordcloud = {"Google": google_top_30["Comment"], "Trustpilot": pilot_top_30["Comment"]}
wordcloud(top_30_wordcloud)
In [86]:
top_30_comments =  pd.concat([google_top_30,  pilot_top_30], ignore_index = True)
df_duplicated = pd.concat([top_30_comments, top_30_comments], ignore_index=True)
model_30, top_30_data, topic_30, probabilities_30  = bert_modelling(df_duplicated)
2025-04-01 04:03:29,736 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-01 04:03:30,963 - BERTopic - Embedding - Completed ✓
2025-04-01 04:03:30,964 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-01 04:03:38,200 - BERTopic - Dimensionality - Completed ✓
2025-04-01 04:03:38,201 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-01 04:03:38,270 - BERTopic - Cluster - Completed ✓
2025-04-01 04:03:38,274 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-01 04:03:38,389 - BERTopic - Representation - Completed ✓
2025-04-01 04:03:39,148 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-01 04:03:39,155 - BERTopic - Dimensionality - Completed ✓
2025-04-01 04:03:39,155 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-01 04:03:39,221 - BERTopic - Cluster - Completed ✓
In [87]:
bert_visuals(model_30)
Total number of topics: 70
Outliers:
    Topic  Count
1     -1    304 

Top topics along with their document frequencies: 
     Topic  Count
6       0     88
17      1     72
4       2     68
9       3     62
42      4     58
22      5     42
0       6     42
14      7     38
68      8     36
10      9     36 

Topic 0 has the following top words: 
 [('smell', 0.03209920574772303), ('smelly', 0.030228486724505444), ('air', 0.029442284051825408), ('disgusting', 0.024554686369513766), ('aircon', 0.02451718576468661), ('worst', 0.023792796209157436), ('stinks', 0.02200092847548957), ('ever', 0.021161352942561253), ('ventilation', 0.0196847984820827), ('urine', 0.018953204835633626)] 

Topic 1 has the following top words: 
 [('pin', 0.07810738736606905), ('pass', 0.06562447771361557), ('code', 0.05415072747804314), ('day', 0.05070777212915479), ('sent', 0.041654405752340876), ('app', 0.039985745448062736), ('access', 0.038156671096345636), ('bought', 0.03340465626022348), ('work', 0.030147532664350097), ('number', 0.029989309086047054)] 

Findings¶

Top Locations for Bad Reviews¶

  • The top three locations for negative reviews across both datasets combined are:
    • London Stratford (81 negative reviews)
    • Leicester Walnut Street (61)
    • London Enfield (48)

Topic Modelling¶

(Note: Due to a lack of data, the top-30-location data has been duplicated so that BERTopic has enough documents to fit and to render the intertopic distance map. An alternative is sketched after this list.)

  • The topics follow a similar theme to the previous BERTopic results.
  • However, there are two noticeable additions to the top topics:
    • Gym equipment issues.
    • Shower problems.
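
Rather than duplicating documents, an alternative worth trying is to loosen the clustering so BERTopic can fit the small corpus directly. A sketch, assuming the umap-learn and hdbscan packages that BERTopic already depends on; the parameter values are illustrative, not tuned:

In [ ]:
from umap import UMAP
from hdbscan import HDBSCAN

# Smaller neighbourhoods and a lower minimum cluster size let the model
# form topics on a small corpus without duplicating documents
umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=5, metric="euclidean", prediction_data=True)

small_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, verbose=True)
topics_small, probs_small = small_model.fit_transform(top_30_comments["Comment"].tolist())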

Conducting Emotion Analysis¶

In [46]:
model_name = "bhadresh-savani/bert-base-uncased-emotion"
classifier = pipeline("text-classification", model = model_name, truncation=True)
Device set to use cuda:0
In [47]:
google_neg_emotion = google_neg[["Comment", "Location Name"]].copy()
trustpilot_neg_emotion = trustpilot_neg[["Comment", "Location Name"]].copy()
In [48]:
def emotion_analysis(text):

  result = classifier(text)[0]["label"]

  return result
In [49]:
google_neg_emotion["emotion"] = google_neg_emotion["Comment"].apply(emotion_analysis)
trustpilot_neg_emotion["emotion"] = trustpilot_neg_emotion["Comment"].apply(emotion_analysis)
In [50]:
def emotion_chart(values):

  for name, i in values.items():

    plt.bar(i.index, i.values)
    plt.xlabel("Emotion")
    plt.ylabel("Distribution")
    plt.title(f"Distribution for all negative reviews in {name}")
    plt.show()
In [51]:
emotion_count_google = google_neg_emotion["emotion"].value_counts()
emotion_count_pilot = trustpilot_neg_emotion["emotion"].value_counts()
In [52]:
emotion_chart({"google": emotion_count_google, "Trustpilot" : emotion_count_pilot})
In [56]:
neg_only= pd.concat([google_neg_emotion[google_neg_emotion["emotion"]== "anger"], trustpilot_neg_emotion[trustpilot_neg_emotion["emotion"]== "anger"]], ignore_index = True)
In [57]:
model_anger, anger_data, topic_anger, probabilities_anger  = bert_modelling(neg_only)
2025-04-01 03:13:58,891 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-01 03:14:00,256 - BERTopic - Embedding - Completed ✓
2025-04-01 03:14:00,257 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-01 03:14:12,737 - BERTopic - Dimensionality - Completed ✓
2025-04-01 03:14:12,739 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-01 03:14:12,824 - BERTopic - Cluster - Completed ✓
2025-04-01 03:14:12,827 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-01 03:14:12,916 - BERTopic - Representation - Completed ✓
2025-04-01 03:14:13,683 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-01 03:14:13,691 - BERTopic - Dimensionality - Completed ✓
2025-04-01 03:14:13,692 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-01 03:14:13,776 - BERTopic - Cluster - Completed ✓
In [58]:
bert_visuals(model_anger)
Total number of topics: 36
Outliers:
    Topic  Count
0     -1    826 

Top topics along with their document frequencies: 
     Topic  Count
1       0    308
2       1    107
3       2     83
5       3     75
21      4     69
16      5     66
4       6     57
25      7     55
9       8     51
22      9     48 

Topic 0 has the following top words: 
 [('nah', 3.619248420447182), ('know', 1.4617849136528802), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)] 

Topic 1 has the following top words: 
 [('rude', 0.03431141872944007), ('staff', 0.02840506990443558), ('member', 0.02520934244959189), ('manager', 0.023528607119674173), ('gym', 0.02139922651380406), ('personal', 0.018953258511792522), ('aggressive', 0.016668275528548066), ('reviews', 0.016001951029404715), ('puregym', 0.015331679815607645), ('one', 0.01292573313094035)] 

In [62]:
neg_only[["Location Name", "emotion"]].value_counts().head(3)
Out[62]:
                                 count
Location Name           emotion
London Stratford        anger       37
Leicester Walnut Street anger       22
London Enfield          anger       19

Findings¶

Conducting Emotion Analysis Focused on Negative Reviews for PureGym (bert-base-uncased-emotion)¶

  • Both datasets contain the same set of emotions in almost the same rank order; the only difference is that sadness and joy swap places in the distribution.
  • Anger was the overwhelming emotion in both datasets, with over 1,000 occurrences in each.

Topic Modelling Where the Emotion is Anger¶

  • Looking at the intertopic distance map, we identified 6 distinct topics, with other topics clustered in or around these main topics.
  • The top topic words remain very similar to previous BERTopic results, with topics centered around:
    • Rude staff.
    • Issues with the toilets.
    • Access problems.
  • The similarity matrix between topics indicates that BERTopic has done a good job of separating the topics, given how light the colors are.
  • The top three locations where anger is the dominant emotion are:
    • London Stratford – 37 occurrences.
    • Leicester Walnut Street – 22 occurrences.
    • London Enfield – 19 occurrences.
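
A side note on performance: applying the classifier row by row triggers the sequential-GPU warning seen earlier. A sketch of batched inference with the same emotion pipeline (the batch size of 32 is an arbitrary choice):

In [ ]:
# Classify all comments in batches instead of one call per row
comments = google_neg_emotion["Comment"].tolist()
results = classifier(comments, batch_size=32)
google_neg_emotion["emotion"] = [r["label"] for r in results]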

Using a Large Language Model from Hugging Face (Phi)¶

In [88]:
phi_neg = pd.concat([google_neg[["Comment", "Location Name"]],  trustpilot_neg[["Comment", "Location Name"]]], ignore_index = True)
In [89]:
torch.random.manual_seed(0)
Out[89]:
<torch._C.Generator at 0x7e26d6572930>
In [90]:
model_path = "microsoft/Phi-4-mini-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
In [91]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
Device set to use cuda
In [92]:
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}
In [93]:
def phi_model(text):

    # Few-shot prompt: a system role, one worked example, then the review
    messages = [
        {"role": "system", "content": "You work as a data analyst insights guru for a top gym in the UK and you want to find topics for improvements from customer reviews. You must return exactly 3 key topics as strings, strictly formatted as 'topic 1', 'topic 2', 'topic 3'."},
        {"role": "user", "content": "In the following customer review interaction, pick out exactly 3 main topics and return them as a valid Python list: Broken equipment, overcrowded, dirty locker rooms, and unhelpful staff. Look elsewhere!."},
        {"role": "assistant", "content": "'Broken equipment', 'overcrowded', 'unhelpful staff'"},
        {"role": "user", "content": f"In the following customer review interaction, pick out exactly 3 main topics and return only those 3 as a string: {text}"},
        ]
    # Normalize quotes so downstream parsing is consistent
    output = pipe(messages, **generation_args)[0]['generated_text'].replace("'", '"')

    return output
In [94]:
phi_neg["topics"] = phi_neg["Comment"].apply(phi_model)
In [95]:
phi_neg.to_csv("phi_model.csv", index=False)
In [104]:
model_phi = BERTopic(verbose=True)
topics_phi, probabilities_phi = model_phi.fit_transform(phi_neg['topics'].tolist())
2025-04-01 05:20:24,451 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-01 05:20:26,769 - BERTopic - Embedding - Completed ✓
2025-04-01 05:20:26,770 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-01 05:20:41,070 - BERTopic - Dimensionality - Completed ✓
2025-04-01 05:20:41,072 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-01 05:20:41,312 - BERTopic - Cluster - Completed ✓
2025-04-01 05:20:41,316 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-01 05:20:41,456 - BERTopic - Representation - Completed ✓
In [105]:
bert_visuals(model_phi)
Total number of topics: 101
Outliers:
    Topic  Count
1     -1   2247 

Top topics along with their document frequencies: 
     Topic  Count
2       0    373
7       1    199
11      2    182
23      3    144
3       4    141
36      5    139
26      6     97
49      7     93
38      8     88
6       9     80 

Topic 0 has the following top words: 
 [('unhelpful', 0.12300326579982984), ('overcrowded', 0.10022180631271357), ('broken', 0.09263096587012651), ('staff', 0.0736963666938909), ('equipment', 0.038364977494423844), ('away', 0.0023281513814752044), ('overcrowdedstaff', 0.002150786506725446), ('unavailability', 0.0019639597145591097), ('properly', 0.0017536863381075911), ('put', 0.0017204835562337562)] 

Topic 1 has the following top words: 
 [('gym', 0.023842928637476644), ('experience', 0.020038469076131472), ('poor', 0.01742995207675169), ('workout', 0.01630277849121239), ('worse', 0.016142740962067064), ('inadequate', 0.01405116986141448), ('faulty', 0.01307668117884842), ('terrible', 0.012086116101734209), ('worst', 0.011972406023336333), ('equipment', 0.011579355751910698)] 

In [106]:
def phi_improvements(text):

    messages_1 = [
        {"role": "system", "content": "You work as a data analyst insights guru for a top gym in the UK and you want to compress a given list into collated topics. You should return these in a numbered list as business insights that can be used to improve the business"},
        {"role": "user", "content": "In the following list containing the main extracted topics from customer reviews, group or compress the topics and return them with actionable insights in a numbered list: [Poor service experience','Comparison of gym and service','Overall satisfaction','Service improvement needed','bad quality','Cleanliness of establishment','Neutral dining experience','Potential health and safety concerns','Overall satisfaction',]"},
        {"role": "assistant", "content": "1. Cleanliness: Increase cleaning frequency and enforce hygiene rules. \n 2. Equipment maintenance: Repair faulty machines promptly and schedule regular maintenance. \n ..."},
        {"role": "user", "content": f"In the following list containing the main extracted topics from customer reviews, group or compress the topics and return them with actionable insights in a numbered list: {text}"},
        ]
    output = pipe(messages_1, **generation_args)[0]['generated_text']

    return output
In [115]:
split_topic = np.array_split( phi_neg['topics'].tolist(), 2)
In [117]:
for i in split_topic:
  print(phi_improvements(i))
1. Overcrowding: Implement a reservation system to manage gym capacity and reduce overcrowding.
2. Customer service: Train staff to improve customer service and professionalism.
3. Equipment maintenance: Regularly check and repair gym equipment to prevent breakdowns.
4. Hygiene: Increase cleaning frequency and enforce hygiene rules to maintain clean facilities.
5. Staff behavior: Provide training to staff to improve their behavior and professionalism.
6. Gym atmosphere: Create a welcoming and motivating gym atmosphere to enhance the customer experience.
7. App functionality: Improve the app's functionality to reduce wasted time and address day pass and turnstile access issues.
1. Temperature control: Install adjustable thermostats in showers and ensure consistent water temperature.
2. Membership fees: Review pricing structure and consider offering tiered membership options.
3. Outdoor facilities: Regularly maintain and upgrade outdoor amenities to enhance user experience.
4. Long wait times: Implement a more efficient booking system and increase staff during peak hours.
5. Lack of customer service: Provide regular training to staff on customer service skills and communication.
6. Improve communication: Establish clear communication channels and provide regular updates to members.
7. Hygiene cleanliness standards: Enforce strict hygiene protocols and conduct regular inspections.
8. Changing room maintenance: Regularly clean and maintain changing rooms to ensure a pleasant experience.
9. Staff management: Invest in staff training and development to improve overall service quality.
10. Shower temperature: Install adjustable thermostats in showers and ensure consistent water temperature.
11. Cost saving: Review and optimize operational costs to offer more competitive membership fees.
12. Millhouses: Regularly maintain and upgrade facilities to enhance user experience.
13. Overcrowded: Implement a booking system to manage peak times and prevent overcrowding.
14. Dirty locker rooms: Regularly clean and maintain locker rooms to ensure a pleasant experience.
15. Unhelpful staff: Provide regular training to staff on customer service skills and communication.

Findings¶

BERTopic Modelling¶

(Three new topics were generated for each negative review using the Phi model. These topics were then passed to BERTopic for further topic modelling.)

  • The total number of topics has dramatically increased to over 100.
  • No new insights have been gained, as the top topics still highlight the same key issues (one possible refinement is sketched after this list):
    • Staff interactions
    • Air conditioning problems
    • Cleanliness of the gym
    • Issues with gym classes
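
One possible refinement: each Phi output is a single string containing three topics, so BERTopic currently models the combined string. A sketch (assuming phi_neg from the cells above, and that individual topics never contain commas) that splits each output into separate documents first:

In [ ]:
# Split each Phi output ('"a", "b", "c"') into three standalone documents
topic_docs = [
    t.strip().strip('"')
    for row in phi_neg["topics"]
    for t in row.split(",")
]
model_split = BERTopic(verbose=True)
topics_split, probs_split = model_split.fit_transform(topic_docs)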

Recommendations¶

(The newly created topics were passed back to the Phi model to generate actionable insights.)

  • The dataset was split into two and passed to the Phi model in chunks to avoid errors.
  • Due to this split, some recommendations were repeated; these duplicates have been ignored (a sketch of automated de-duplication follows this list).
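
The de-duplication above was done by eye; a sketch that automates the exact-repeat case (near-duplicates, such as the two shower-temperature items, would still need a manual pass):

In [ ]:
# Merge the two chunked Phi outputs and drop exact repeats
# (assumes split_topic and phi_improvements from the cells above)
seen = set()
final_recommendations = []
for chunk in split_topic:
    for line in phi_improvements(chunk).splitlines():
        recommendation = line.split(".", 1)[-1].strip().lower()  # drop the "N." prefix
        if recommendation and recommendation not in seen:
            seen.add(recommendation)
            final_recommendations.append(line.strip())
print("\n".join(final_recommendations))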

Final Recommendations:¶

  1. Overcrowding – Implement a reservation system to manage gym capacity and reduce congestion.
  2. Customer service – Train staff to improve professionalism and service quality.
  3. Equipment maintenance – Regularly inspect and repair gym equipment to prevent breakdowns.
  4. Hygiene – Increase cleaning frequency and enforce hygiene rules for cleaner facilities.
  5. Staff behavior – Provide additional training to improve staff interactions with customers.
  6. Gym atmosphere – Foster a welcoming and motivating environment for a better user experience.
  7. App functionality – Improve the app to enhance usability and resolve issues with day passes and turnstile access.
  8. Temperature control – Install adjustable thermostats in showers to ensure consistent water temperature.
  9. Membership fees – Reevaluate pricing and consider offering tiered membership options.
  10. Outdoor facilities – Regularly maintain and upgrade outdoor amenities for better usability.
  11. Long wait times – Introduce a more efficient booking system and increase staff presence during peak hours.
  12. Improve communication – Establish clear communication channels and provide regular updates to members.
  13. Changing room maintenance – Regularly clean and maintain changing rooms to improve the user experience.
  14. Millhouses – Regularly maintain and upgrade facilities to enhance the overall experience.

Using Gensim for Topic Modelling¶

In [109]:
comments_neg = phi_neg["Comment"]
In [110]:
tokenized_docs = [word_tokenize(comment) for comment in comments_neg]
dictionary = corpora.Dictionary(tokenized_docs)
# Drop tokens seen in fewer than 2 documents or in more than 50% of them
dictionary.filter_extremes(no_below=2, no_above=0.5)
In [111]:
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

num_topics = 10
passes = 20
In [112]:
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
In [113]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
Topic: 0 
Words: 0.027*"staff" + 0.017*"manager" + 0.015*"rude" + 0.013*"member" + 0.011*"one" + 0.011*"members" + 0.009*"said" + 0.008*"personal" + 0.008*"left" + 0.008*"customer"
Topic: 1 
Words: 0.046*"water" + 0.030*"showers" + 0.025*"cold" + 0.024*"broken" + 0.020*"shower" + 0.017*"months" + 0.015*"hot" + 0.015*"machine" + 0.014*"machines" + 0.013*"order"
Topic: 2 
Words: 0.018*"membership" + 0.017*"email" + 0.016*"day" + 0.015*"pin" + 0.015*"get" + 0.014*"code" + 0.014*"fee" + 0.013*"joining" + 0.013*"pass" + 0.011*"account"
Topic: 3 
Words: 0.023*"parking" + 0.014*"pure" + 0.013*"back" + 0.011*"puregym" + 0.010*"park" + 0.009*"one" + 0.009*"free" + 0.009*"car" + 0.007*"date" + 0.007*"would"
Topic: 4 
Words: 0.015*"staff" + 0.013*"members" + 0.013*"people" + 0.011*"management" + 0.010*"like" + 0.010*"puregym" + 0.008*"member" + 0.008*"machine" + 0.007*"time" + 0.007*"one"
Topic: 5 
Words: 0.030*"membership" + 0.013*"cancel" + 0.012*"pay" + 0.011*"month" + 0.011*"would" + 0.011*"use" + 0.010*"gyms" + 0.009*"get" + 0.009*"go" + 0.008*"puregym"
Topic: 6 
Words: 0.049*"air" + 0.019*"conditioning" + 0.016*"music" + 0.016*"hot" + 0.015*"con" + 0.015*"temperature" + 0.009*"classes" + 0.009*"loud" + 0.009*"pt" + 0.009*"working"
Topic: 7 
Words: 0.039*"classes" + 0.039*"class" + 0.017*"one" + 0.014*"time" + 0.013*"get" + 0.010*"minutes" + 0.010*"booked" + 0.009*"instructor" + 0.009*"first" + 0.009*"work"
Topic: 8 
Words: 0.018*"equipment" + 0.016*"changing" + 0.015*"machines" + 0.010*"dirty" + 0.010*"toilets" + 0.010*"always" + 0.010*"place" + 0.009*"staff" + 0.009*"cleaning" + 0.009*"weights"
Topic: 9 
Words: 0.052*"equipment" + 0.025*"people" + 0.024*"use" + 0.019*"busy" + 0.018*"gyms" + 0.018*"machines" + 0.013*"many" + 0.012*"time" + 0.011*"go" + 0.011*"get"
In [114]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)
Out[114]:
[Interactive pyLDAvis intertopic distance map rendered in the notebook output]
Findings¶

Using Gensim for Topic Modelling (Max Topics Set to 10)¶

  • Using Gensim does not provide any new insights (a coherence check on the 10-topic setting is sketched after this list).
  • The identified topics remain consistent with previous BERTopic results.
  • The key recurring issues are:
    • Staff interactions
    • Air conditioning problems
    • Cleanliness of the gym
    • Issues with gym classes
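
To put a number on the quality of the 10-topic LDA model, gensim's CoherenceModel can score it; a minimal sketch using the c_v measure on the lda_model, tokenized_docs, and dictionary objects from the cells above:

In [ ]:
from gensim.models import CoherenceModel

# Higher c_v coherence generally indicates more interpretable topics
coherence = CoherenceModel(
    model=lda_model,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence="c_v",
).get_coherence()
print(f"c_v coherence for {num_topics} topics: {coherence:.3f}")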