“Jeep — The Middle” User Clustering

Background Information

Response to the Ads


The Data

Analysis Objective

Project Dependencies

import itertools as it
import json
import logging
import os
import pickle
import re
import string
import time

import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.models import HoverTool
from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

from classes.user_preprocessor import UserPreProcessor

# Render Bokeh plots inline in the notebook.
output_notebook()

Styling Our Visualizations & Import Our User Data PreProcessor

# Shared preprocessor instance; the vectorizer setup further down calls its
# generate_stopwords, replace_www and tweet_tokenizer members.
UPP = UserPreProcessor()

Building Our Data Structures

Handling Dataset Size

# Registry of every user that tweeted about the ad, keyed by the user's id.
jeep_users = {}


def track_users(obj: dict):
    """Record one tweet's author in the module-level ``jeep_users`` registry.

    The first time a user id is seen a profile entry is stored. Every later
    call for the same id only increments that entry's ``count`` and leaves
    the stored profile untouched.

    Args:
        obj: A user object with the keys ``id``, ``screen_name``,
            ``created_at``, ``description``, ``followers_count``,
            ``location``, ``lang``, ``verified`` and ``id_str``.

    Raises:
        KeyError: If ``obj`` lacks one of the keys listed above.
    """
    user_id = obj['id']
    if user_id in jeep_users:
        jeep_users[user_id]['count'] += 1
    else:
        # Without this ``else`` the entry would be overwritten on every call
        # and the counter reset, so repeat tweeters could never be counted.
        jeep_users[user_id] = {
            "screen_name": obj['screen_name'],
            "created_at": obj['created_at'],
            "description": obj['description'],
            # NOTE(review): key is spelled "follwers_count"; kept as-is because
            # previously saved JSON may rely on this spelling.
            "follwers_count": obj['followers_count'],
            "location": obj['location'],
            "lang": obj['lang'],
            "verified": obj['verified'],
            "id_str": obj['id_str'],
            # NOTE(review): starts at 0, so ``count`` is the number of repeat
            # tweets rather than total tweets; confirm that is intended.
            "count": 0,
        }
# Walk every cleaned feather file and register the author of each tweet that
# belongs to the "Jeep - middle" ad.
for file in os.listdir('data/clean_data'):
    df = pd.read_feather(f'data/clean_data/{file}')
    temp_df = df[df['Ad Name'] == 'Jeep - middle']
    for index, row in temp_df['user'].items():
        # NOTE(review): the loop body was missing from this listing; passing
        # each user object to track_users is the presumed original behaviour.
        track_users(row)

Save jeep_users Dictionary to JSON File

# Load the cached user registry from disk.
with open('data/descriptions/jeep_users.json', 'r', encoding='utf-8') as fh:
    jeep_users = json.load(fh)

# Persist the registry. An empty 'w' block would only truncate the file, so
# the dictionary is written back explicitly.
# NOTE(review): the listing shows the read before the write and the write body
# was missing; confirm the intended order of these two steps.
with open('data/descriptions/jeep_users.json', 'w', encoding='utf-8') as fh:
    json.dump(jeep_users, fh)

Vectorized Twitter Bios


Building the vectorizer

Building the Data Objects

user_count = len(jeep_users)
stopwords = UPP.generate_stopwords()

# Vocabulary is capped at one feature per 100 users; the floor of 1 keeps the
# vectorizer constructible for small samples, where the division yields 0.
vectorizer = TfidfVectorizer(
    preprocessor=UPP.replace_www,
    tokenizer=UPP.tweet_tokenizer,
    stop_words=stopwords,
    max_features=max(1, user_count // 100),
)

# Parallel lists: users[i] is the author of bios[i].
users = []
bios = []
for key, value in jeep_users.items():
    # Users without a description contribute an empty document so that the
    # vectorizer never receives None.
    if value['description'] is None:
        bio = ''
    else:
        bio = value['description']
    # NOTE(review): the append lines were missing from this listing; storing
    # the screen name is assumed because `users` later feeds the 'user_name'
    # column of the plotting DataFrame.
    users.append(value['screen_name'])
    bios.append(bio)

bio_matrix = vectorizer.fit_transform(bios)
CPU times: user 46.2 s, sys: 1.41 s, total: 47.6 s
Wall time: 49.8 s

<26144x261 sparse matrix of type '<class 'numpy.float64'>'
with 133592 stored elements in Compressed Sparse Row format>

View Bio Make Up

# Print the first 20 bios, one per line, with embedded newlines flattened.
for i, bio in enumerate(bios[:20]):
    print(i, ': ', bio.replace('\n', ' '))
0 : We're just nerds with a passion for games, movies, and everything entertainment. I'm Rick, also try @jdselig, @dpadben & @CanoJaguar | http://dpad.fm/wiki
1 : Proidly paid for MLBTv subscription to watch the Phillies lose 444 games between 2012 and 2017
2 : I like making analogies and pretty things.
3 : Don't postpone joy.
4 : I head PR for the greatest city in the world's tourism board @nycgo @nycgo_press Husband. New Dad to JD. Instagram: http://bit.ly/36rgbNo 🗽
5 : Problym Chyld Fonzirelli® 🏴‍☠️ #PROzart 🥭#DILF #ShadowLawLLC #WOOSH ⚡️#Anniez 🧶 #4600 🏎 #Capish ShadowlawLLC@iCloud.com #Wavior 🌊🏄🏽‍♂️🦈
6 : PhD of Common Sense (Graduated with honors) Masters of Hypocrisy! Liberal to the bone (for twitter purposes only) 😉😉🤔🤔🤫
7 : A high octane dose of journeyman's talents
8 : I’m hot cause I’m fly. you hate cause you not ✨
9 : Philly sports and bad takes
10 : Java Junkie. Marketing Guru. Podcast Creator. Know It All. Views are all MINE - no one else would claim them. She/her
11 : He is greater than I - Imperfect follower of Jesus Christ - Lover of video games and music aficionado - Conservatism is the way - PTA - Bro/Bruh
12 : Sasha Banks is the 🐐!! Proud Chickahominy Tribe member. Fan account
13 : retired nurse ,mom ,sister and grandma. i love sewing ,politics ,football (geaux Saints) ,and reading.
14 : Unapologetic, Prius-driving, middle-aged mom of 4 & Mimi, married for 30+years to her senior prom date. Firebrand. Raising my voice & reclaiming my time.
15 : Just a regular guy/husband/Dad/PT/nerd/ #Northeastern professor with a wide variety of interests. #StarWars #LordOfTheRings #Patriots #craftbeer
16 : hi I’m a nurse and I love to travel • she/her
17 : @IBMWatsonHealth; former @_carbondesign, @apple, @NASAAmes. I speak for none of them.
18 : #obnoxiousmusic #Cardinals #Razorbacks #Futility
19 : I love hippies ☮️

KMeans Clustering of User Bios

Testing Cluster Count

# Candidate cluster counts and the quality metrics collected for each.
ks = [2, 50, 200, 500]
sil_scores = []
inertias = []

for k in ks:
    logging.warning(f'fitting model for {k}')
    # n_jobs is no longer accepted by KMeans in current scikit-learn releases;
    # n_init is pinned so results do not depend on the library's default.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    # labels_ and inertia_ only exist once the model has been fitted.
    model.fit(bio_matrix)
    labels = model.labels_
    sil_scores.append(silhouette_score(bio_matrix, labels))
    inertias.append(model.inertia_)

# plot the quality metrics for inspection, one metric per panel
fig, ax = plt.subplots(2, 1, sharex=True)

ax[0].plot(ks, inertias, 'o--')
ax[0].set_ylabel('inertia')
ax[0].set_title('kmeans parameter search')

ax[1].plot(ks, sil_scores, 'o--')
ax[1].set_ylabel('silhouette score')
ax[1].set_xlabel('k')
WARNING:root:fitting model for 2
WARNING:root:fitting model for 50
WARNING:root:fitting model for 200
WARNING:root:fitting model for 500
CPU times: user 4min 22s, sys: 31.3 s, total: 4min 53s
Wall time: 2min 31s

Text(0.5, 0, 'k')

Run our KMeans Model @ 200 Clusters

# Final model at 200 clusters. Only non-default settings are spelled out:
# n_jobs, precompute_distances and algorithm='auto' are not accepted by
# current scikit-learn releases, and the other original arguments matched the
# library defaults.
kn_model = KMeans(n_clusters=200, n_init=10, random_state=42)
# Fit on the TF-IDF bio matrix so labels_ and cluster_centers_ are available.
kn_model.fit(bio_matrix)
KMeans(n_clusters=200, n_jobs=-1, precompute_distances='auto', random_state=42)

View Strongest Features:

def view_strongest_features(model, vectorizer, topk=10):
    """Print the ``topk`` highest-weighted vocabulary terms of each cluster.

    Each cluster is printed on its own line as ``Cluster <label>:`` followed
    by its terms, ordered from the largest centroid weight downwards.

    Args:
        model: A fitted clustering model. Only models whose class is named
            ``KMeans`` are supported; anything else is logged and skipped.
        vectorizer: The fitted vectorizer whose vocabulary indexes the
            columns of ``model.cluster_centers_``.
        topk: Number of terms to print per cluster.
    """
    model_name = model.__class__.__name__
    # Compare by value: ``is`` on string literals tests identity, not equality.
    if model_name != 'KMeans':
        logging.warning('view_strongest_features does not support %s', model_name)
        return

    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn; fall back to the old name for older installs.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    features = get_names()

    relevant_labels = sorted(set(model.labels_))
    # Column indices of every centroid, sorted by descending weight.
    centroids = model.cluster_centers_.argsort()[:, ::-1]
    for label in relevant_labels:
        print(f'Cluster {label}:', end=' ')
        for ind in centroids[label, :topk]:
            print(f'{features[ind]}', end=' ')
        # Terminate the line so the next cluster starts on a fresh one.
        print()
view_strongest_features(kn_model, vectorizer, topk=10)
Cluster 0: advocate former for and public an am veteran the fan
Cluster 1: i in way a college get more now news life
Cluster 2: #maga manager one beer free but all or school out
Cluster 3: good a and of for food the love i at
Cluster 4: enthusiast and fan of writer sports mom the dad cat
Cluster 5: ’ all you i <-url-> to in are s 🌈
Cluster 6: no i and in for you have to a is
Cluster 7: the to is of in and i with a on
Cluster 8: account twitter fan of new a and the my this
Cluster 9: • black matter lives her she like and alum to
Cluster 10: a of and in the is i to one on
Cluster 11: ️ <-url-> and of to 🇺 the i 🇸 in
Cluster 12: ❤ ️ 💙 my i the and of to <-url->
Cluster 13: <-url-> the and of for to in on at or
Cluster 14: with a and the of people to for my i
Cluster 15: the of and to on is for one a in
Cluster 16: her she of and a the i to mom my
Cluster 17: love and is to my all the family <-url-> god
Cluster 18: american proud and the of fan in to for people
Cluster 19: 💙 <-url-> ️ and ❤ of 🇸 🇺 i #blacklivesmatter
Cluster 20: fan dad coach of sports nerd and at veteran founder
Cluster 21: and to for who in fan the more on i
Cluster 22: • of she her ️ the <-url-> my a ’
Cluster 23: and of all things the lover fan writer proud in
Cluster 24: marketing and digital of writer content in media sports a
Cluster 25: #blacklivesmatter #resist i <-url-> of and ️ are owner the
Cluster 26: is my a the and who of but to in
Cluster 27: we are the all to our and it have for
Cluster 28: i'm a not i just and on of to but
Cluster 29: <-url-> of host out on and founder now the in
Cluster 30: just a to me i the and not for my
Cluster 31: 🌊 #resist 💙 mom to 🇺 a the love 🇸
Cluster 32: too to and a i way for of this about
Cluster 33: alum fan of at the and ️ to tech mom
Cluster 34: ✨ she her ️ of ig love my to and
Cluster 35: matter lives black she her all the i and of
Cluster 36: not are of but and i my do for tweets
Cluster 37: new and of for in the a day sports with
Cluster 38: teacher school retired and former in for mom writer to
Cluster 39: she her <-url-> in and the a of fan •
Cluster 40: them they and of a i he in is him
Cluster 41: ️‍ 🏳 🌈 ️ she her he 🇺 him 🇸
Cluster 42: born and at the a on to of writer i
Cluster 43: “ ” the of a is i to in ’
Cluster 44: mother wife of and lover teacher the for a retired
Cluster 45: father husband and to of a the fan my proud
Cluster 46: by god of the american life jesus who is husband
Cluster 47: fan of sports and football the a to beer <-url->
Cluster 48: tweets are my own and of not i the a
Cluster 49: proud own my are views tweets and mom grad marketing
Cluster 50: professional of and sports to for the <-url-> marketing ️
Cluster 51: dad husband and of fan to the christian in at
Cluster 52: s sports ’ <-url-> news politics retired for that it
Cluster 53: at the and <-url-> of writer for my ️ all
Cluster 54: coach dad husband of and alum football sports at college
Cluster 55: views own alum my are he him and in all
Cluster 56: like i and my to you a the things on
Cluster 57: his he him of the and a for in i
Cluster 58: don t ’ i you me the my if to
Cluster 59: ‘ writer ’ her she of and ️ fan in
Cluster 60: him he #blacklivesmatter of and the ️ <-url-> writer i
Cluster 61: am i a and the of not who to <-url->
Cluster 62: father husband and of coach friend fan christian in author
Cluster 63: to a my and i of trying all be have
Cluster 64: girl mom the and love on my of in living
Cluster 65: truth the and of justice to for my a in
Cluster 66: everything is and for i the on about sports a
Cluster 67: 🇺 🇸 ️ and #maga the ❤ america god of
Cluster 68: mom wife and of to in lover dog the friend
Cluster 69: twitter i to my on and the have not a
Cluster 70: than more better a i is and of to in
Cluster 71: up to and the a my for i'm no don't
Cluster 72: since and the fan for by i of in to
Cluster 73: best the is in to and of for i my
Cluster 74: grad student of and at the to lover <-url-> proud
Cluster 75: own opinions my are and all of for not i
Cluster 76: 🏻 ‍ ️ 🇸 🇺 to and for a on
Cluster 77: never the a is will to and you of be
Cluster 78: twitter on a of the is no to <-url-> me
Cluster 79: a and i for to with have the is in
Cluster 80: also and i a at the of in for on
Cluster 81: a and with in fan is my i'm me <-url->
Cluster 82: be to i a not the can my want you
Cluster 83: 1 she her of 3 political mother wife in you
Cluster 84: <-url-> for and i in at a writer new to
Cluster 85: here just for to the and i'm you be at
Cluster 86: will be the i you and not this to of
Cluster 87: about and i tweet the politics a of my sports
Cluster 88: de la a no ’ ️ me digital director ig
Cluster 89: follow me on back i to and my <-url-> a
Cluster 90: designer and of him he artist she ig lover her
Cluster 91: lover animal dog of and mom writer fan proud friend
Cluster 92: man a the of and family in with old one
Cluster 93: junkie political news and fan sports lover enthusiast of mom
Cluster 94: #resist no the of artist fan retired with progressive enthusiast
Cluster 95: i a my and was the can things to have
Cluster 96: la a de ️ no <-url-> the writer and sports
Cluster 97: art and of culture in food artist travel science music
Cluster 98: an of in and not the a with is to
Cluster 99: for the and in all of to i on is
Cluster 100: state university of and alum the a at is fan
Cluster 101: get to i trying and a the my me just
Cluster 102: live and life love the in for to i music
Cluster 103: ‍ ️ ❤ 🇸 🇺 🏻 i and to of
Cluster 104: 🏾 ‍ ️ black my <-url-> of the in i
Cluster 105: director creative of writer and founder the <-url-> for at
Cluster 106: me the a and <-url-> for you my don't to
Cluster 107: ‘ ’ university alum <-url-> of college football a dad
Cluster 108: ig <-url-> of • in #blacklivesmatter owner food my ‘
Cluster 109: country god family our and trump of the for my
Cluster 110: member of the proud and life <-url-> a in fan
Cluster 111: proud democrat and of liberal dad in father conservative trump
Cluster 112: your to the a you on for i and of
Cluster 113: lives matter black he him his and of i for
Cluster 114: history politics and of music the in science nerd about
Cluster 115: of founder jesus father and all lover the husband host
Cluster 116: in of the a and is our world school on
Cluster 117: former of for and alum in fan <-url-> the writer
Cluster 118: work in a my to of i and on is
Cluster 119: family my and love the in of travel all to
Cluster 120: sometimes i and make for but a the like you
Cluster 121: god family is in of i and all to america
Cluster 122: girl a just in world living to the with my
Cluster 123: s she ’ her and i a my the up
Cluster 124: university of at and the for school alum <-url-> grad
Cluster 125: for and of a all my writer tweets to <-url->
Cluster 126: black the conservative and proud me ig people with on
Cluster 127: political and of science for a in news the media
Cluster 128: <-url-> a and i the ️ ig host in of
Cluster 129: in the and a i my to for all <-url->
Cluster 130: social media of and for manager <-url-> the marketing her
Cluster 131: not the of to but is and in that a
Cluster 132: editor writer and <-url-> of former in to director media
Cluster 133: views own my are and of all not the here
Cluster 134: the of and is in when on i not football
Cluster 135: music and sports politics food travel love <-url-> film of
Cluster 136: 4 of mom to life husband wife a the father
Cluster 137: love and my the i a of to in football
Cluster 138: life your live is and best you in to for
Cluster 139: life is my of a the and to in for
Cluster 140: politics sports and the in progressive science a my i
Cluster 141: advocate for of and mom the enthusiast in to proud
Cluster 142: us and to the in our of for veteran we
Cluster 143: this is a account for the i and of not
Cluster 144: by day and the of for at in a writer
Cluster 145: 3 of kids married and mom father in 2 to
Cluster 146: do i not to what the my and of things
Cluster 147: 2 of to dad the mom 1 a 3 and
Cluster 148: editor video news and writer of <-url-> for podcast host
Cluster 149: blm she her they of and 🌈 a the #blacklivesmatter
Cluster 150: it i a was the and on to me but
Cluster 151: photographer designer and of alum former lover artist at news
Cluster 152: — the of and she art her is on in
Cluster 153: living in the life and a of for to i
Cluster 154: fan and of music college mom writer nerd the manager
Cluster 155: retired from mother democrat of <-url-> blue and dog lover
Cluster 156: real and the my in is a i to life
Cluster 157: student of college and at her she the alum life
Cluster 158: mine are opinions views tweets of and all her she
Cluster 159: go and blue of a the to on i my
Cluster 160: m ’ i here a just the for and not
Cluster 161: #blm #resist democrat in and former all the fan of
Cluster 162: t ’ can i a to it you like of
Cluster 163: news and the sports for in politics <-url-> from world
Cluster 164: business owner and a of to in the mom <-url->
Cluster 165: 2020 trump the of in my for not <-url-> i
Cluster 166: sports and fan for on of i host about all
Cluster 167: be it to all the is a trying will in
Cluster 168: christian conservative of love husband american at wife proud father
Cluster 169: big fan of a the and to for with guy
Cluster 170: world the in to a and of make is trying
Cluster 171: born in and living the of a fan live i
Cluster 172: if you a i it to me the your ’
Cluster 173: producer writer and director tv for music video the <-url->
Cluster 174: as the i a and to you of for are
Cluster 175: i m ’ a my and the to not in
Cluster 176: president of the and is to in trump for a
Cluster 177: books of author and the <-url-> music film a lover
Cluster 178: on the a <-url-> of and to my in host
Cluster 179: stuff and i about other tweet sports of a things
Cluster 180: games video and i sports music of a politics about
Cluster 181: s ’ the of a that to in for my
Cluster 182: you the to what know can i a are of
Cluster 183: human a of animal and <-url-> right in advocate also
Cluster 184: a not is but to with be or of i
Cluster 185: artist writer and of a in <-url-> the photographer w
Cluster 186: it is the to like of or you and a
Cluster 187: guy just a the who that and in of to
Cluster 188: they she he and a your the i of me
Cluster 189: love i and my to the a in music things
Cluster 190: there is no to the in be that a out
Cluster 191: #blacklivesmatter her she my the a ️ writer 🌈 mom
Cluster 192: time at a one the to of day and my
Cluster 193: s it ’ i to the “ ” not and
Cluster 194: author of the and <-url-> book writer for in on
Cluster 195: him he of and the in a for #blacklivesmatter i
Cluster 196: from the and of to a with all that <-url->
Cluster 197: 1 2 the of fan in 3 news former and
Cluster 198: 2 i and for <-url-> a to dogs by married
Cluster 199: lover music of and the food beer all life world

Reducing Dimensionality for Plotting

def maybe_fit_tsne():
    """Return the 2-D t-SNE projection of ``bio_matrix``, using a disk cache.

    The projection is loaded from ``data/full_bio_matrix_2d.npy`` when that
    file exists. Otherwise it is fitted from the module-level ``bio_matrix``
    and saved to that path for later runs.

    Returns:
        The array returned by ``np.load`` or by ``TSNE.fit_transform``.
    """
    file = "data/full_bio_matrix_2d.npy"
    try:
        bio_matrix_2d = np.load(file)
    except FileNotFoundError:
        logging.warning("Fitting TSNE")
        # NOTE(review): further TSNE keyword arguments appear to have been
        # lost from this listing; only n_components is visible.
        tsne = TSNE(n_components=2)
        # toarray() yields a plain ndarray, whereas todense() returns the
        # discouraged np.matrix type.
        bio_matrix_2d = tsne.fit_transform(bio_matrix.toarray())

        # Make sure the cache directory exists so a long fit is not lost to
        # a failed save.
        os.makedirs(os.path.dirname(file), exist_ok=True)
        np.save(file, bio_matrix_2d)
    else:
        logging.warning("loading cached TSNE file")
    return bio_matrix_2d
# 2-D coordinates for plotting, as returned by maybe_fit_tsne().
bio_matrix_2d = maybe_fit_tsne()
WARNING:root:loading cached TSNE file
CPU times: user 3.16 ms, sys: 2.69 ms, total: 5.85 ms
Wall time: 27.5 ms

Visualizing Our Data

def build_plottable_dataframe(users: list, bios: list, coord: object, labels: list):
    """Assemble the per-user table consumed by the cluster plot.

    Args:
        users: User names, one per row.
        bios: Bio texts, aligned with ``users``.
        coord: Two-column array-like indexed as ``coord[:, 0]`` (x) and
            ``coord[:, 1]`` (y), aligned with ``users``.
        labels: Cluster label of every row.

    Returns:
        A DataFrame with the columns ``user_name``, ``text``, ``x_val``,
        ``y_val``, ``cluster`` and ``color``. ``color`` is a hex string
        shared by all rows of the same cluster.
    """
    # Sort the distinct labels once so the label-to-colour mapping is
    # deterministic, rather than relying on the iteration order of a set.
    unique_labels = sorted(set(labels))
    colors = sns.color_palette('hls', len(unique_labels)).as_hex()
    color_lookup = dict(zip(unique_labels, colors))

    df = pd.DataFrame({
        'user_name': users,
        'text': bios,
        'x_val': coord[:, 0],
        'y_val': coord[:, 1],
        'cluster': labels,
    })
    df['color'] = [color_lookup[label] for label in labels]
    return df
def plot_cluster(df, title='t-SNE plot'):
    """Build an interactive Bokeh scatter plot of the clustered users.

    Args:
        df: DataFrame with the columns ``x_val``, ``y_val``, ``color``,
            ``cluster``, ``user_name`` and ``text``.
        title: Title shown above the plot.

    Returns:
        The configured Bokeh figure; pass it to ``show`` to render it.
    """
    # add our DataFrame as a ColumnDataSource for Bokeh
    plot_data = ColumnDataSource(df)
    # configure the chart; width/height replace plot_width/plot_height, which
    # newer Bokeh releases no longer accept
    tsne_plot = figure(title=title, width=800, height=700, tools=('pan, box_zoom, reset'))
    # add a hover tool to display words on roll-over; it has to be attached
    # to the figure, otherwise it has no effect
    tsne_plot.add_tools(
        HoverTool(tooltips = """<div style="width: 400px;"><strong>Cluster: @cluster</strong> | <u>User Name: @user_name</u> | <i>Bio: @text</i></div>""")
    )
    # draw the words as circles on the plot
    # NOTE(review): the remaining arguments of this call were missing from the
    # listing; source and color are the minimum needed to plot the DataFrame
    # columns, and any size/alpha settings should be restored if known.
    tsne_plot.circle('x_val', 'y_val', source=plot_data, color='color')
    # configure visual elements of the plot
    tsne_plot.title.text_font_size = '12pt'
    tsne_plot.xaxis.visible = True
    tsne_plot.yaxis.visible = True
    tsne_plot.grid.grid_line_color = None
    tsne_plot.outline_line_color = None
    return tsne_plot
# Combine users, bios, 2-D coordinates and KMeans labels, then render the plot.
df = build_plottable_dataframe(users, bios, bio_matrix_2d, kn_model.labels_)
show(plot_cluster(df, 'Projection of K Means Clustered Super Bowl Users'))

Cluster Insights


trending towards data science; lover of python; trying to improve every day. Check it out → https://github.com/drewipson

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store