Use Sentence Transformers to embed the sentences and search for subjects within tweets.
This is an experimental approach to see how themes persist in the data over a long period.
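As a minimal sketch of the idea (illustrative only, using the same public all-MiniLM-L6-v2 model loaded further down), a theme term and a tweet can each be embedded and compared with cosine similarity:
from sentence_transformers import SentenceTransformer
import numpy as np
# Illustrative sketch only: embed a hypothetical theme term and tweet, then compare them
_demo_model = SentenceTransformer('all-MiniLM-L6-v2')
_theme_vec = _demo_model.encode('national security')
_tweet_vec = _demo_model.encode('Ottawa weighs a ban on Huawei 5G gear over spying fears')
_sim = np.dot(_theme_vec, _tweet_vec) / (np.linalg.norm(_theme_vec) * np.linalg.norm(_tweet_vec))
print(round(float(_sim), 3))  # a higher value means the tweet sits closer to the theme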
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import pickle
import re
import string
import warnings
warnings.filterwarnings("ignore")
MIN_VAL = 0.3
COUNTRY2 = "CA"
LOAD_FRESH = False # if False, use the cached version of the data (much quicker!)
TOPIC_MODEL = False # Use the topics from the topic model run -- must also use LOAD_FRESH = True
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
# Set the random seed to a fixed value
torch.manual_seed(42)
# Stopwords - Use a more robust list
##from nltk.corpus import stopwords
##stopwords = stopwords.words('english')
stopwords = [ "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", 
"hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", 
"sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz" ]
stopwords = [i.replace('"',"").strip() for i in stopwords]
stopwords = set( [s.lower().strip() for s in stopwords] )
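A quick illustrative check of the list:
# Illustrative check: generic words are filtered out later by clean_txt(), substantive ones are kept
print('about' in stopwords)   # True  -> would be removed
print('huawei' in stopwords)  # False -> would be kept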
def get_embeddings(text):
    """
    Split text into sentences and get an embedding for each sentence.
    The final embedding is the mean of all sentence embeddings.
    :param text: str. Input text.
    :return: np.array. Embedding.
    """
    ##return model.encode( text )
    return np.mean(
        model.encode(
            list(set(re.findall('[^!?.]+[!?.]?', text)))
        ), axis=0)
def clean_txt(text, remove_punct=False):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode()
    # Remove punctuation - KEEP for get_embeddings()
    if remove_punct:
        text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers -- Keep for 5G and related
    ##text = re.sub(r'\d+', '', text)
    # Convert to lowercase
    #text = text.lower()
    # Drop stopwords and URLs, and strip #/@/_ markers
    text = ' '.join([word.replace('#','').replace('@','').replace('_','') for word in text.split() if ( word.lower() not in stopwords and 'http' not in word )])
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
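For example (a made-up tweet, shown only to illustrate clean_txt()), URLs, stopwords, and #/@/_ markers are stripped while the remaining wording is preserved:
# Hypothetical tweet used only to illustrate the cleaning step
sample = "Check this out! @HuaweiMobile says #5G is coming to Canada https://t.co/abc123"
print(clean_txt(sample))
print(clean_txt(sample, remove_punct=True))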
import numpy as np
from sentence_transformers import SentenceTransformer
def euclidean_distance(u: np.ndarray, v: np.ndarray) -> float:
    """Calculates the Euclidean distance between two vectors"""
    return np.linalg.norm(u - v)

def manhattan_distance(u: np.ndarray, v: np.ndarray) -> float:
    """Calculates the Manhattan distance between two vectors"""
    return np.sum(np.abs(u - v))

def minkowski_distance(u: np.ndarray, v: np.ndarray, p: int = 2) -> float:
    """Calculates the Minkowski distance between two vectors"""
    return np.sum(np.abs(u - v) ** p) ** (1 / p)

def cosine_similarity(a, b):
    """
    Computes the cosine similarity between two vectors.
    Args:
        a (numpy array): The first vector.
        b (numpy array): The second vector.
    Returns:
        The cosine similarity between the two vectors.
    """
    cs = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    #print( f"CS:{round(cs,3)}" )
    return cs
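A toy illustration of the helpers above (made-up vectors); Minkowski with p=2 reduces to the Euclidean distance and with p=1 to the Manhattan distance, and only the cosine similarity is used later in this notebook:
# Toy vectors, purely illustrative
u = np.array([1.0, 0.0, 1.0])
v = np.array([0.5, 0.5, 1.0])
print(euclidean_distance(u, v), minkowski_distance(u, v, p=2))  # identical
print(manhattan_distance(u, v), minkowski_distance(u, v, p=1))  # identical
print(cosine_similarity(u, v))                                  # in [-1, 1]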
# mycounter takes the pipe-separated match string, the name of one theme, and the list of all theme
# names, and returns the proportion of matches that belong to that theme.
def mycounter( txt, theme_name, all_themes ):
    lst = txt.split('|')
    mycounts = {}
    for atheme in all_themes:
        mycounts[ atheme ] = lst.count(atheme)
    total = float( sum( mycounts.values() ) )
    for atheme in all_themes:
        if total > 0:
            mycounts[ atheme ] = mycounts[ atheme ] / total
        else:
            mycounts[ atheme ] = 0.0
    return mycounts[ theme_name ]
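A worked example with a hypothetical match string: a tweet whose matches were Security, Security, Business gives proportions of 2/3 and 1/3:
# Hypothetical results string, shown only to illustrate mycounter()
example_themes = ['Business', 'Security', 'Politics']
print(mycounter('Security|Security|Business', 'Security', example_themes))  # 0.666...
print(mycounter('Security|Security|Business', 'Business', example_themes))  # 0.333...
print(mycounter('', 'Politics', example_themes))                            # 0.0 when nothing matches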
# Calculate the cosine similarity between each sentence and each theme
def get_themes( sentence, sentence_embedding ):
    #print("----")
    #print(f"Sentence: {sentence}")
    if len( sentence.strip() ) < 15 or len( clean_txt(sentence,remove_punct=True) ) == 0:
        return ""
    best = []
    for j, theme in enumerate(themes):
        se = sentence_embedding
        te = theme_embeddings[j]
        similarity = cosine_similarity(te,se)
        if similarity <= MIN_VAL:
            continue
        # Keep the 10 best matches: compare against the weakest similarity retained so far
        if len(best) < 10 or similarity > best[-1][0]:
            val = ( similarity, theme )
            best.append( val )
            best = sorted( best, key=lambda x: x[0], reverse=True )
            if len( best ) > 10:
                best = best[0:10]
        #print(f" Similarity to '{theme}': {similarity}")
    best = [ resolver[t[1]] for t in best] # Make nice format
    return '|'.join(best)
The basic themes are the same for every country, with a few changes made for consistency. Discussions about Canada within Canada do not count as discussion of an ally, but discussions about Canada in the USA or the UK do. Similarly, the names of each country's major telcos are added to the Business category.
# Load pre-trained sentence transformer model
#model = SentenceTransformer('bert-base-uncased') # better but expensive
##model = SentenceTransformer('paraphrase-mpnet-base-v2')
model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder="cache") # smaller
print("Max Sequence Length:", model.max_seq_length)
# ['Allies','Business','Politics','Security','Technology', 'HuaweiPR']
key_themes = {
    'Business'  :['economics','business','innovation','GDP','trade','trade war','trade deal','commercial'],
    'Security'  :['national security','security','espionage','spying','ban','banning','surveillance','backdoor'],
    'Technology':['technology','intellectual property','5G networks'],
    'Allies'    :['Canada','CA','UK','New Zealand','NZ','Australia','AU','United Kingdom','United States','US','USA','five eyes','alliance',
                  'Morrison','Trudeau','Trump','Ardern','Boris Johnson'],
    'Huawei PR' :['android','apple','autodesk','autonomous','battery','camera','cities','fastcharger','fitness','giveaway','headset','industrial','iot','laptop','latest','lens','market','matebook','matepad','mateview','monitor','neweggcan','notebook','oracle','patent','phone','prizes','projector','revenue','smartwatch','supported','tablet','tracker','wifi']
}
if COUNTRY2 == "CA":
key_themes['Politics'] = ['politics', 'leadership','Charest','PierrePoilievre','Poilievre','Trudeau','Justin Trudeau', 'Liberal','Conservative','JeanCharest','JustinTrudeau','Scheer']
key_themes['Business'].extend(["Telus","Bell",'Quebecor','Shaw','Rogers',"Ericsson","Samsung","Nokia"])
key_themes['Allies'].remove('CA')
key_themes['Allies'].remove('Canada')
#key_themes['Allies'].remove('Meng Wanzhou')
key_themes['Allies'].remove('Trudeau')
if COUNTRY2 == "AU":
key_themes['Politics'] = ['politics', 'leadership','Labour','Liberal','Conservative','MPs','sponsor','travel','overseas travel','Scott Morrison']
key_themes['Business'].extend(["Telstra","Optus",'TPG','VHA',"Ericsson","Samsung","Nokia"])
key_themes['Allies'].remove('Australia')
key_themes['Allies'].remove('AU')
key_themes['Allies'].remove('Morrison')
if COUNTRY2 == "US":
key_themes['Politics'] = ['politics', 'leadership','Liberal','Conservative','GOP','Democrat','Republican','Democratic Party','Trump','Donald Trump','DonaldTrump']
key_themes['Business'].extend(["Verizon","T-Mobile",'AT&T',"Ericsson","Samsung","Nokia"])
key_themes['Allies'].remove('USA')
key_themes['Allies'].remove('US')
key_themes['Allies'].remove('United States')
key_themes['Allies'].remove('Trump')
if COUNTRY2 == "NZ":
key_themes['Politics'] = ['politics','leadership','Labour','National','Greens','Maori Party','ACT','Jacinda Arden','PM','Arden','Hipkins','NZ First']
key_themes['Business'].extend(["Spark","Vodaphone",'2degrees',"Ericsson","Samsung","Nokia"])
key_themes['Allies'].remove('New Zealand')
key_themes['Allies'].remove('NZ')
key_themes['Allies'].remove('Arden')
if COUNTRY2 == "UK":
key_themes['Politics'] = ['politics', 'leadership','Labour','Liberal','Conservative','MPs','BorisJohnson','Boris Johnson','rebellion','Tory']
key_themes['Business'].extend(["BT","Virgin",'Sky','Vodaphone',"Ericsson","Samsung","Nokia"])
key_themes['Allies'].remove('UK')
key_themes['Allies'].remove('United Kingdom')
key_themes['Allies'].remove('Boris Johnson')
# TEST - probably remove this key_themes
##key_themes = {}
# Canada - topics
# Huawei as security threat
##key_themes['Security Threat'] = ['5g','bell','canada','china','compensation','conservative','crimes','decision','enforcement','equipment','ericsson','fed','hack','justintrudeau','kovrig','leader','liberal','meng','pierrepoilievre','premier','putin','security''signal','spies','surveillance','taxpayers','telecom','telus','trudeau','uyghur','wanzhou']
# Huawei as a legitimate company. i.e. normal PR
##key_themes['Huawei PR'] = ['android','apple','autodesk','autonomous','battery','camera','cities','fastcharger','fitness','giveaway','headset','industrial','iot','laptop','latest','lens','market','matebook','matepad','mateview','monitor','neweggcan','notebook','oracle','patent','phone','prizes','projector','revenue','smartwatch','supported','tablet''tracker','wifi']
# resolver is a dictionary that takes any search term and looks up the general topic it belongs to
resolver = {}
for key in key_themes.keys():
    vlst = key_themes.get( key, [] )
    for item in vlst:
        resolver[ item ] = key
# Make a single list of thematic words to search for
themes = []
for lst in key_themes.values():
    themes.extend( lst )
# Get embeddings for the themes
theme_embeddings = model.encode(themes)
Max Sequence Length: 256
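As a quick sanity check (a made-up sentence, not from the data), the helpers above can now be run end to end, since the model, theme lists, and resolver all exist:
# Illustrative end-to-end check on a hypothetical sentence
_s = "The government will review national security concerns about 5G networks."
print(get_themes(_s, get_embeddings(_s)))  # pipe-separated theme names, e.g. 'Security|Technology|...'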
if LOAD_FRESH:
    # Open main file
    if TOPIC_MODEL:
        df = pd.read_csv(open(f"../topics/topics_{COUNTRY2}.csv",'r')) # For topic data
    else:
        df = pd.read_csv(open(f"../location/full_{COUNTRY2}.csv",'r')) # For full timeline
    #pass # I don't want to load this by accident -- this takes forever to run!
    df['text'] = df['cleaned_text'].apply( clean_txt ) # Clean text
    sentences_all = df['cleaned_text'].values
    sentences = df['text'].values
    print( sentences[0] ) # Display first sentence for sanity check
    # Get embeddings for the sentences
    print("Getting sentence embeddings...")
    df['sentence_embedding'] = df['text'].apply( get_embeddings )
    print("Get sentence embeddings... done")
def infer_central_phrase(embeddings, tokenizer, sentences):
    # NOTE: experimental helper; it is never called in this notebook (the call below is commented out).
    # Compute pairwise cosine similarities between sentence embeddings.
    # util.cos_sim handles the full matrix; the local cosine_similarity() above works on single vectors.
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    # Compute the average cosine similarity of each sentence embedding to all other sentence embeddings
    avg_similarities = np.mean(similarity_matrix, axis=1)
    # Find the sentence embedding with the highest average cosine similarity
    central_index = np.argmax(avg_similarities)
    central_embedding = embeddings[central_index]
    # Convert central_embedding to token IDs
    # (this step assumes vocabulary-sized score vectors rather than dense sentence embeddings)
    central_ids = np.argmax(central_embedding, axis=1).tolist()
    # Convert token IDs to tokens and then decode into text
    central_tokens = tokenizer.convert_ids_to_tokens(central_ids)
    central_phrase = tokenizer.convert_tokens_to_string(central_tokens)
    # Retrieve the corresponding sentence from the original sentences array
    central_sentence = sentences[central_index]
    return central_phrase, central_sentence
#infer_central_phrase()
if LOAD_FRESH:
    # Use the resolver dict made above to get the number of themes in a sentence
    df['results'] = df.apply( lambda x: get_themes( x.text, x.sentence_embedding ), axis=1 )
    for t in key_themes.keys():
        df[t] = df.apply(lambda x: mycounter(x.results,t,key_themes.keys()), axis=1 )
    #df.drop( 'results', axis=1, inplace=True )
    #df.drop( 'sentence_embedding', axis=1, inplace=True )
    if TOPIC_MODEL:
        print("Writing topic data")
        df.to_csv(open(f"out_topic_{COUNTRY2}.csv",'w',newline='\n')) # Write results out to file for later processing
    else:
        df.to_csv(open(f"out_full_{COUNTRY2}.csv",'w',newline='\n')) # Write results out to file for later processing
##COUNTRY2 = "UK"
key_themes_lst = ['Business','Politics','Security','Technology','Allies','Huawei PR'] # Main themes defined
if not LOAD_FRESH:
    if TOPIC_MODEL:
        print("Reading topic model cached data")
        #df = pd.read_csv(open(f"out_topic_{COUNTRY2}.csv",'r')) # Read cached data from topic model run
        df = pd.read_csv(open(f"topics_{COUNTRY2}.csv",'r'))
        key_themes_lst.append( 'topic' ) # Some additional columns to keep
    else:
        print("Reading full time cached data")
        df = pd.read_csv(open(f"out_full_{COUNTRY2}.csv",'r')) # READ cached file!
        key_themes_lst.append( 'name' )
        key_themes_lst.append( 'date' )
# Read the output file
df = df[ key_themes_lst ] # just keep the key_theme columns (and extras)
# Convert date column to datetime object. Set it to the start of the week period.
df["week"] = pd.to_datetime(df["date"]).dt.to_period('W').dt.start_time
df["month"] = pd.to_datetime(df["date"]).dt.to_period('M').dt.start_time
# Write the results to be used in an Excel template for visualization
df.to_csv(open(f'piv_{COUNTRY2}.csv','w',newline='\n'))
Reading full time cached data
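For reference, the period conversion above maps each date to the start of its week and to the first day of its month (illustrative dates only):
# Illustration of the week/month conversion used above
_d = pd.Series(pd.to_datetime(['2020-01-29', '2020-02-11']))
print(_d.dt.to_period('W').dt.start_time)  # start of each date's week
print(_d.dt.to_period('M').dt.start_time)  # start of each date's month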
df.columns
Index(['Business', 'Politics', 'Security', 'Technology', 'Allies', 'Huawei PR', 'name', 'date', 'week', 'month'], dtype='object')
df = df.sort_values(by='month', ascending=True)
pf = pd.pivot_table( df, index=['month'] ,aggfunc=np.sum )
pf = pf.reset_index()
if 'topic' in pf.columns:
    pf.drop('topic', axis=1, inplace=True)
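For comparison only (not used below), the pivot above is equivalent to a per-month groupby-sum over the theme columns listed in df.columns:
# Equivalent monthly aggregation, shown for comparison only
theme_cols = ['Business', 'Politics', 'Security', 'Technology', 'Allies', 'Huawei PR']
monthly = df.groupby('month', as_index=False)[theme_cols].sum()
monthly.head()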
import matplotlib.pyplot as plt
#%matplotlib widget
%matplotlib inline
# CNet timeline https://www.cnet.com/news/privacy/huawei-ban-timeline-detained-cfo-makes-deal-with-us-justice-department/
ax = pf.plot.area(x='month', stacked=True)
#plt.stackplot(pf['month'], pf['Business'],pf['Security'],pf['Politics'],pf['Allies'],pf['Technology'], labels=['Bus','B','C'])
ymin, ymax = plt.ylim()
TEN_PERC = round( ymax * 0.10)
THREE_PERC = round( ymax * 0.03)
TOP_LABEL = ymax - TEN_PERC
MID_LABEL = round( ymax / 2 ) + TEN_PERC
LOW_LABEL = ymin + THREE_PERC
print( TOP_LABEL )
# Ban dates for all the five eyes
dd = {'2022-05-17':('CA',''),'2018-07-11':('AU',''),"2019-05-15":('US','r'),"2020-07-14":('UK','')}
for dt, i in dd.items():
    align_dir = "right" if i[1] != 'r' else 'left'
    ax.axvline( pd.to_datetime(dt), color='black', linestyle=':', lw=1, alpha=0.5) # ban date
    ax.annotate(f' {i[0]} \n ban ', xy=(dt, TOP_LABEL ) ,horizontalalignment=align_dir, fontsize=10 )
# Canada-related events (shown for several countries)
if COUNTRY2 in ['AU','CA','US','UK']:
    ax.axvline( pd.to_datetime('2021-09-24'), color='black', linestyle=':', lw=1) # Canada: the two Michaels returned
    ax.annotate(' Two\n Michaels\n Return', xy=('2021-09-24', MID_LABEL) )
    ax.axvline( pd.to_datetime('2018-12-01'), color='white', linestyle='--', lw=1) # Canada: Meng Wanzhou arrested
    ax.annotate(' Huawei\n CFO\n arrested', xy=('2018-12-01', LOW_LABEL), color='white', fontweight='bold' )
# UK events
if COUNTRY2 in ['UK','CA','US','NZ']:
    ax.axvline( pd.to_datetime('2020-01-28'), color='red', linestyle=':', lw=1 , alpha=0.5) # UK allows Huawei to build network
    ax.annotate(' UK to \n allow \n Huawei ', xy=('2020-01-28', MID_LABEL), color='red', horizontalalignment='right' )
    # Jan. 29, 2020 - Europe allows Huawei for 5G through security guidelines
    ax.axvline( pd.to_datetime('2020-01-29'), color='red', linestyle=':', lw=1, alpha=0.5)
    ax.annotate(' EU to \n allow \n Huawei ', xy=('2020-01-29', TOP_LABEL*.85), color='red' ,horizontalalignment='right' )
    # Feb 11, 2020 - US finds Huawei has backdoor access to mobile networks globally, report says
    ax.axvline( pd.to_datetime('2020-02-11'), color='red', linestyle=':', lw=1, alpha=0.5)
    ax.annotate(' Huawei \n backdoors \n found ', xy=('2020-02-11', MID_LABEL), color='red' )
    if COUNTRY2 != 'NZ':
        ax.axvline( pd.to_datetime('2020-03-10'), color='white', linestyle='--', lw=1) # UK MP rebellion
        ax.annotate(' UK\n MPs \n lose \n vote', xy=('2020-03-10', LOW_LABEL), color='white', fontweight='bold' )
# US events
if COUNTRY2 in ['US','CA','AU']:
    # Nov 22, 2019 - Huawei banned from US subsidies
    #ax.axvline( pd.to_datetime('2019-11-22'), color='red', linestyle=':', lw=1, alpha=0.5)
    #ax.annotate(' US \n subsidies \n ban ', xy=('2019-11-22', TOP_LABEL*.85), color='red' ,horizontalalignment='right' )
    ax.fill_between(['2019-10-16', '2020-02-11'], [ymax,ymax], color='gray', alpha=0.2) # ratcheting up of tensions
    ax.axvline( pd.to_datetime('2020-12-04'), color='red', linestyle=':', lw=1) # US deal to release Meng?
    ax.annotate(' Trump \n alludes to \n release of \n Huawei \n CFO ', xy=('2020-12-04', MID_LABEL), color='red' , horizontalalignment='left' )
# AU events
if COUNTRY2 in ["AU"]:
    ax.axvline( pd.to_datetime('2018-08-27'), color='red', linestyle=':', lw=1) # Turnbull resigns
    ax.annotate(' Turnbull\n resigns ', xy=('2018-08-24', (TOP_LABEL*0.9)), color='red' )
    ax.axvline( pd.to_datetime('2018-06-06'), color='red', linestyle=':', lw=1) # FB shared data with Huawei scandal
    ax.annotate(' FB \n shares \n data ', xy=('2018-06-06', MID_LABEL), color='red', horizontalalignment='right' )
    # UK announces it will allow Huawei into its network before backpedalling
    ax.axvline( pd.to_datetime('2020-02-28'), color='red', linestyle=':', lw=1)
    ax.annotate(' UK \n allows \n Huawei ', xy=('2020-02-28', MID_LABEL), color='red' , horizontalalignment='right' )
# NZ events
if COUNTRY2 == "NZ":
    # https://www.sparknz.co.nz/news/GCSB_declines_Spark_proposal_Huawei/
    ax.axvline( pd.to_datetime('2018-11-28'), color='white', linestyle='--', lw=1, alpha=1) # GCSB blocks use of Huawei by Spark
    ax.annotate(' GCSB \n block ', xy=('2018-11-28', LOW_LABEL*.5), color='white', horizontalalignment='right',fontsize=10, weight='bold' )
    # https://www.stuff.co.nz/business/112486350/huawei-nz-we-know-the-rumours-gossip-and-innuendo-about-us?rm=m
    ax.axvline( pd.to_datetime('2019-05-06'), color='white', linestyle='--', lw=1) # Huawei PR
    ax.annotate(' Huawei \n PR ', xy=('2019-05-06', 40), color='white', weight='bold' )
    # Reports of NZ tensions with China
    ax.axvline( pd.to_datetime('2019-02-12'), color='white', linestyle='--', lw=1)
    ax.annotate(' NZ-China \n tension ',
                bbox=dict(boxstyle="round", fc="1"), # fc="1" sets the annotation box face colour to white
                xytext=('2019-01-01', MID_LABEL*1.25),
                color='black', horizontalalignment='right',
                xy=('2019-02-12', 400),
                arrowprops=dict(
                    arrowstyle="->",
                    #connectionstyle="bar",
                    connectionstyle="arc",
                    #ec="k",
                    shrinkA=0, shrinkB=5
                )
               )
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.title(f'Themes about Huawei discussed over time ({COUNTRY2})')
plt.xlabel('Date')
plt.ylabel('Theme discussed')
import matplotlib
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(7.5, 5)
plt.savefig(f'stacked_area_{COUNTRY2}.pdf', bbox_inches='tight') # saves the current figure
plt.show()
10500.599999999999
set( df.topic )
# This only works when the data has topics from the BERTopic model
if TOPIC_MODEL:
    print( "Running!" )
    # Now let's see the overlap between categories and topics
    key_themes_lst = ['Allies','Business','Politics','Security','Technology']
    df = df.sort_values(by='month', ascending=True)
    sdf = pd.pivot_table( df, index=['topic','name'] ,aggfunc=np.sum )
    sdf['row_sum'] = sdf[key_themes_lst].sum(axis=1)
    #sdf.drop('index', axis=1, inplace=True)
    #sdf.drop('topic', axis=1, inplace=True)
    sdf.drop('row_sum', axis=1, inplace=True)
    sdf[key_themes_lst] = sdf[key_themes_lst].div(sdf[key_themes_lst].sum(axis=1), axis=0) # normalize
    sdf[key_themes_lst] = sdf[key_themes_lst].round(decimals=2)
#sdf.style.set_precision(2).background_gradient(cmap='RdYlGn',axis=1,vmin=0, vmax=1)
sdf_styled = sdf.style.set_precision(2).background_gradient(cmap='Blues',axis=1,vmin=0, vmax=1)
sdf_styled
# Write out the styled table to disk
#!python -m pip install dataframe_image
import dataframe_image as dfi
dfi.export(sdf_styled, f'{COUNTRY2}_theme_table_styled.png')
sdf_styled
False