Topic Search¶

Use Sentence Transformers to embed sentences and search for subjects (themes) within tweets.

This is an experimental approach to see how themes persist in the data over a long period.
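
As a rough illustration of the approach (a minimal sketch only, using the same all-MiniLM-L6-v2 model that is loaded later in this notebook; the example tweet and keyword list are made up): each sentence and each theme keyword are embedded, and a cosine similarity above a threshold counts as a mention of that theme.

In [ ]:
# Illustrative sketch only: embed one (made-up) tweet sentence and a few theme keywords,
# then score them by cosine similarity (the MIN_VAL = 0.3 set below is used as the match threshold).
from sentence_transformers import SentenceTransformer, util

sketch_model = SentenceTransformer('all-MiniLM-L6-v2')
tweet = "Ottawa says it will ban Huawei gear from Canada's 5G networks"
keywords = ["national security", "5G networks", "trade deal"]

scores = util.cos_sim(sketch_model.encode(tweet), sketch_model.encode(keywords))[0]
for kw, score in zip(keywords, scores):
    print(kw, round(float(score), 3))  # keywords scoring above ~0.3 would count as theme matches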

In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import pickle
import re
import string
import torch
import warnings
warnings.filterwarnings("ignore")

MIN_VAL = 0.3        # minimum cosine similarity for a sentence to count as mentioning a theme
COUNTRY2 = "CA"
LOAD_FRESH = False   # if False, use the cached version of the data (much quicker!)
TOPIC_MODEL = False  # use the topics from the topic model run -- must also set LOAD_FRESH = True

# Set the random seed to a fixed value for reproducibility
torch.manual_seed(42)
Out[1]:
<torch._C.Generator at 0x1d162f60170>
In [2]:
# Stopwords - Use a more robust list

##from nltk.corpus import stopwords
##stopwords = stopwords.words('english')

stopwords = [ "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", 
"hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", 
"sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz" ]
stopwords = [i.replace('"',"").strip() for i in stopwords]
stopwords = set( [s.lower().strip() for s in stopwords] )
In [3]:
def get_embeddings(text):
    """
    Split texts into sentences and get embeddings for each sentence.
    The final embedding is the mean of all sentence embeddings.
    :param text: str. Input text.
    :return: np.array. Embeddings.
    """
    ##return model.encode( text )
    return np.mean(
        model.encode(
            list(set(re.findall('[^!?.]+[!?.]?', text)))
        ), axis=0)

def clean_txt(text, remove_punct=False):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode()
    # Remove punctuation - KEEP for get_embedding()
    if remove_punct:
        text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers -- Keep for 5G and related
    ##text = re.sub(r'\d+', '', text)
    # Convert to lowercase
    #text = text.lower()
    text = ' '.join([word.replace('#','').replace('@','').replace('_','') for word in text.split() if ( word.lower() not in stopwords and 'http' not in word )])
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def euclidean_distance(u: np.ndarray, v: np.ndarray) -> float:
    """Calculates the Euclidean distance between two vectors"""
    return np.linalg.norm(u - v)

def manhattan_distance(u: np.ndarray, v: np.ndarray) -> float:
    """Calculates the Manhattan distance between two vectors"""
    return np.sum(np.abs(u - v))

def minkowski_distance(u: np.ndarray, v: np.ndarray, p: int = 2) -> float:
    """Calculates the Minkowski distance between two vectors"""
    return np.sum(np.abs(u - v) ** p) ** (1 / p)

def cosine_similarity(a, b):
    """
    Computes the cosine similarity between two vectors.

    Args:
    a (numpy array): The first vector.
    b (numpy array): The second vector.

    Returns:
    The cosine similarity between the two vectors.
    """
    cs = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    #print( f"CS:{round(cs,3)}" )
    return cs


# mycounter: given the '|'-joined result string from get_themes(), return the proportion of matches that belong to theme_name
def mycounter( txt, theme_name, all_themes ):
    lst = txt.split('|')
    mycounts = {}
    for atheme in all_themes:
        mycounts[ atheme ] = lst.count(atheme)
    total = float( sum( mycounts.values() ) )
    for atheme in all_themes:
        if total > 0:
            mycounts[ atheme ] = mycounts[ atheme ] / total
        else:
            mycounts[ atheme ] = 0.0
    return mycounts[ theme_name ]
        
# Calculate the cosine similarity between each sentence and each theme
def get_themes( sentence, sentence_embedding ):
    #print("----")
    #print(f"Sentence: {sentence}")
    if len( sentence.strip() ) < 15 or len( clean_txt(sentence,remove_punct=True) ) == 0:
        return ""
    best = []
    for j, theme in enumerate(themes):
        se = sentence_embedding
        te = theme_embeddings[j]
        similarity = cosine_similarity(te,se)
        if similarity <= MIN_VAL:
            continue
        if len(best) < 10 or similarity > best[-1][0]:
            val = ( similarity, theme )
            best.append( val )
            best = sorted( best, key=lambda x: x[0], reverse=True )
            if len( best ) > 10:
                best = best[0:10]
        #print(f"    Similarity to '{theme}': {similarity}")
    best = [ resolver[t[1]]  for t in best] # Make nice format
    return '|'.join(best)
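
For reference (a small self-contained example with made-up values): get_themes() returns a '|'-joined string of matched theme names, and mycounter() turns that string into per-theme proportions.

In [ ]:
# Self-contained example (made-up values): how a get_themes() result string
# becomes per-theme proportions via mycounter().
example_result = "Business|Security|Business"      # tweet matched Business twice, Security once
example_themes = ["Business", "Security", "Politics"]
for t in example_themes:
    print(t, round(mycounter(example_result, t, example_themes), 3))
# Business 0.667, Security 0.333, Politics 0.0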

Each country has its own set of political terms to account for local scandals and leaders¶

The basic themes are the same for every country, with a few adjustments made for consistency. Discussions about Canada within the Canadian data do not count as discussion of an ally, but discussions about Canada in the US or UK data do. Similarly, the names of each country's major telcos are added to its Business category.
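
For example (a toy illustration only, not the actual key_themes/resolver dictionaries built in the next cell), the same term can resolve to a different category depending on COUNTRY2:

In [ ]:
# Toy illustration of the per-country adjustments described above
# (the real key_themes / resolver dictionaries are built in the next cell).
toy_resolver_ca = {'Trudeau': 'Politics', 'Boris Johnson': 'Allies', 'Telus': 'Business'}
toy_resolver_uk = {'Trudeau': 'Allies', 'Boris Johnson': 'Politics', 'BT': 'Business'}
print("CA:", toy_resolver_ca['Trudeau'])  # in the Canadian data, Trudeau counts as domestic Politics
print("UK:", toy_resolver_uk['Trudeau'])  # in the UK data, Trudeau counts as an Allies mention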

In [5]:
# Load pre-trained sentence transformer model
#model = SentenceTransformer('bert-base-uncased') # better but expensive
##model = SentenceTransformer('paraphrase-mpnet-base-v2')
model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder="cache") # smaller
print("Max Sequence Length:", model.max_seq_length)

# Theme categories: ['Allies','Business','Politics','Security','Technology','Huawei PR']
key_themes = {
    'Business' :['economics','business', 'innovation','GDP','trade','trade war','trade deal','commercial'],
    'Security'  :['national security','security','espionage','spying','ban','banning','surveillance','backdoor'],
    'Technology':['technology','intellectual property','5G networks',],
    'Allies' :['Canada','CA','UK','New Zealand','NZ','Australia','AU','United Kingdom','United States','US','USA','five eyes','alliance',
              'Morrison','Trudeau','Trump','Arden','Boris Johnson'],
    'Huawei PR' :['android','apple','autodesk','autonomous','battery','camera','cities','fastcharger','fitness','giveaway','headset','industrial','iot','laptop','latest','lens','market','matebook','matepad','mateview','monitor','neweggcan','notebook','oracle','patent','phone','prizes','projector','revenue','smartwatch','supported','tablet','tracker','wifi']

}

if COUNTRY2 == "CA":
    key_themes['Politics'] = ['politics', 'leadership','Charest','PierrePoilievre','Poilievre','Trudeau','Justin Trudeau', 'Liberal','Conservative','JeanCharest','JustinTrudeau','Scheer']
    key_themes['Business'].extend(["Telus","Bell",'Quebecor','Shaw','Rogers',"Ericsson","Samsung","Nokia"])
    key_themes['Allies'].remove('CA')
    key_themes['Allies'].remove('Canada')
    #key_themes['Allies'].remove('Meng Wanzhou')
    key_themes['Allies'].remove('Trudeau')

if COUNTRY2 == "AU":
    key_themes['Politics'] = ['politics', 'leadership','Labour','Liberal','Conservative','MPs','sponsor','travel','overseas travel','Scott Morrison']
    key_themes['Business'].extend(["Telstra","Optus",'TPG','VHA',"Ericsson","Samsung","Nokia"])
    key_themes['Allies'].remove('Australia')
    key_themes['Allies'].remove('AU')
    key_themes['Allies'].remove('Morrison')

if COUNTRY2 == "US":
    key_themes['Politics'] = ['politics', 'leadership','Liberal','Conservative','GOP','Democrat','Republican','Democratic Party','Trump','Donald Trump','DonaldTrump']
    key_themes['Business'].extend(["Verizon","T-Mobile",'AT&T',"Ericsson","Samsung","Nokia"])
    key_themes['Allies'].remove('USA')
    key_themes['Allies'].remove('US')
    key_themes['Allies'].remove('United States')
    key_themes['Allies'].remove('Trump')

if COUNTRY2 == "NZ":
    key_themes['Politics'] = ['politics','leadership','Labour','National','Greens','Maori Party','ACT','Jacinda Arden','PM','Arden','Hipkins','NZ First']
    key_themes['Business'].extend(["Spark","Vodaphone",'2degrees',"Ericsson","Samsung","Nokia"])
    key_themes['Allies'].remove('New Zealand')
    key_themes['Allies'].remove('NZ')
    key_themes['Allies'].remove('Arden')

if COUNTRY2 == "UK":
    key_themes['Politics'] = ['politics', 'leadership','Labour','Liberal','Conservative','MPs','BorisJohnson','Boris Johnson','rebellion','Tory']
    key_themes['Business'].extend(["BT","Virgin",'Sky','Vodaphone',"Ericsson","Samsung","Nokia"])
    key_themes['Allies'].remove('UK')
    key_themes['Allies'].remove('United Kingdom')
    key_themes['Allies'].remove('Boris Johnson')

# TEST - probably remove this key_themes
##key_themes = {}
# Canada - topics
# Huawei as security threat
##key_themes['Security Threat'] = ['5g','bell','canada','china','compensation','conservative','crimes','decision','enforcement','equipment','ericsson','fed','hack','justintrudeau','kovrig','leader','liberal','meng','pierrepoilievre','premier','putin','security''signal','spies','surveillance','taxpayers','telecom','telus','trudeau','uyghur','wanzhou']
# Huawei as a legitimate company. i.e. normal PR
##key_themes['Huawei PR'] = ['android','apple','autodesk','autonomous','battery','camera','cities','fastcharger','fitness','giveaway','headset','industrial','iot','laptop','latest','lens','market','matebook','matepad','mateview','monitor','neweggcan','notebook','oracle','patent','phone','prizes','projector','revenue','smartwatch','supported','tablet''tracker','wifi']

    
# resolver is a dictionary that takes any search term and looks up the general topic it belongs to
resolver = {} 
for key in key_themes.keys():
    vlst = key_themes.get( key, [] )
    for item in vlst:
        resolver[ item ] = key

# Make a single list of thematic words to search for
themes = []
for lst in key_themes.values():
    themes.extend( lst )

# Get embeddings for the themes
theme_embeddings = model.encode(themes)
Max Sequence Length: 256
In [ ]:
 

Read the topic results and look up each tweet to determine the themes it touches on¶

In [6]:
if LOAD_FRESH:
    # Open main file
    if TOPIC_MODEL:
        df = pd.read_csv(open(f"../topics/topics_{COUNTRY2}.csv",'r')) # For topic data
    else:
        df = pd.read_csv(open(f"../location/full_{COUNTRY2}.csv",'r')) # For full timeline
        #pass # I don't want to load this by accident -- this takes forever to run!
        
    df['text']    = df['cleaned_text'].apply( clean_txt ) # Clean text
    sentences_all = df['cleaned_text'].values
    sentences     = df['text'].values
    print( sentences[0] ) # Display first sentence for sanity check

    # Get embeddings for the sentences
    print("Getting sentence embeddings...")
    df['sentence_embedding'] = df['text'].apply( get_embeddings )
    print("Get sentence embeddings... done")
In [8]:
def infer_central_phrase(embeddings, sentences):
    """
    Find the most "central" sentence: the one whose embedding has the highest
    average cosine similarity to all the other sentence embeddings.
    """
    # Compute pairwise cosine similarities between sentence embeddings
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    # Compute the average cosine similarity of each sentence to all the others
    avg_similarities = np.mean(similarity_matrix, axis=1)

    # The sentence with the highest average similarity is the most central
    central_index = np.argmax(avg_similarities)

    # Retrieve the corresponding sentence from the original sentences array
    central_sentence = sentences[central_index]

    return central_index, central_sentence


#infer_central_phrase()
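
A usage sketch of the helper above (illustrative only; the sample sentences are made up, and the call uses the model loaded earlier to produce the embeddings):

In [ ]:
# Illustrative usage only: find the most representative sentence in a small made-up sample.
sample_sentences = [
    "Canada will ban Huawei from its 5G networks.",
    "Ottawa announces Huawei and ZTE are barred from 5G infrastructure.",
    "Huawei launches a new smartwatch with longer battery life.",
]
sample_embeddings = model.encode(sample_sentences)
idx, central = infer_central_phrase(sample_embeddings, sample_sentences)
print(idx, central)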

Write out results to a temp file¶

In [9]:
if LOAD_FRESH: 
    # Use the resolver dict made above to get the number of themes in a sentence
    df['results'] = df.apply( lambda x: get_themes( x.text, x.sentence_embedding ), axis=1 )
    for t in key_themes.keys():
        df[t] = df.apply(lambda x: mycounter(x.results,t,key_themes.keys()), axis=1 )

    #df.drop( 'results', axis=1, inplace=True )
    #df.drop( 'sentence_embedding', axis=1, inplace=True )
    if TOPIC_MODEL:
        print("Writing topic data")
        df.to_csv(open(f"out_topic_{COUNTRY2}.csv",'w',newline='\n')) # Write results out to file for later processing
    else:
        df.to_csv(open(f"out_full_{COUNTRY2}.csv",'w',newline='\n')) # Write results out to file for later processing
        

Read the temporary file and prepare for the Excel pivot visualization¶

In [10]:
##COUNTRY2 = "UK"

key_themes_lst = ['Business','Politics','Security','Technology','Allies','Huawei PR'] # Main themes defined

if not LOAD_FRESH:
    if TOPIC_MODEL:
        print("Reading topic model cached data")
        #df = pd.read_csv(open(f"out_topic_{COUNTRY2}.csv",'r')) # Read cached data from topic model run
        df = pd.read_csv(open(f"topics_{COUNTRY2}.csv",'r')) 
        key_themes_lst.append( 'topic' ) # Some additional columns to keep
    else:
        print("Reading full time cached data")
        df = pd.read_csv(open(f"out_full_{COUNTRY2}.csv",'r')) # READ cached file!


key_themes_lst.append( 'name' )
key_themes_lst.append( 'date' )

df = df[ key_themes_lst ] # keep only the key_theme columns (plus the extras added above)
# Convert the date column to datetime and bucket it into week and month periods (start of each period)
df["week"] = pd.to_datetime(df["date"]).dt.to_period('W').dt.start_time
df["month"] = pd.to_datetime(df["date"]).dt.to_period('M').dt.start_time


# Write the results to be used in an Excel template for visualization
df.to_csv(open(f'piv_{COUNTRY2}.csv','w',newline='\n'))
Reading full time cached data
In [11]:
df.columns
Out[11]:
Index(['Business', 'Politics', 'Security', 'Technology', 'Allies', 'Huawei PR',
       'name', 'date', 'week', 'month'],
      dtype='object')

Pivot dataframe and visualize¶

In [12]:
df = df.sort_values(by='month', ascending=True)
pf = pd.pivot_table( df, index=['month'] ,aggfunc=np.sum )
pf = pf.reset_index()

if 'topic' in pf.columns:
    pf.drop('topic', axis=1, inplace=True)

Visualization of timeline using stacked area plot¶

In [14]:
import matplotlib.pyplot as plt
#%matplotlib widget
%matplotlib inline

# CNet timeline https://www.cnet.com/news/privacy/huawei-ban-timeline-detained-cfo-makes-deal-with-us-justice-department/


ax = pf.plot.area(x='month', stacked=True)
#plt.stackplot(pf['month'], pf['Business'],pf['Security'],pf['Politics'],pf['Allies'],pf['Technology'],  labels=['Bus','B','C'])
ymin, ymax = plt.ylim() 
TEN_PERC = round( ymax * 0.10)
THREE_PERC = round( ymax * 0.03)
TOP_LABEL = ymax - TEN_PERC
MID_LABEL = round( ymax / 2 ) + TEN_PERC
LOW_LABEL = ymin + THREE_PERC
print( TOP_LABEL )

# Ban dates for the Five Eyes countries (New Zealand's GCSB block is marked separately below)
dd = {'2022-05-17':('CA',''),'2018-07-11':('AU',''),"2019-05-15":('US','r'),"2020-07-14":('UK','')}
for dt, i in dd.items():
    align_dir = "right" if i[1] != 'r' else 'left'
    ax.axvline( pd.to_datetime(dt), color='black', linestyle=':', lw=1, alpha=0.5) # ban date marker
    ax.annotate(f' {i[0]} \n ban ', xy=(dt, TOP_LABEL ) ,horizontalalignment=align_dir, fontsize=10 )

# Canada-related events
if COUNTRY2 in ['AU','CA','US','UK']: 
    ax.axvline( pd.to_datetime('2021-09-24'), color='black', linestyle=':', lw=1) # Canada 2Michaels returned
    ax.annotate(' Two\n Michaels\n Return', xy=('2021-09-24', MID_LABEL)   )
    ax.axvline( pd.to_datetime('2018-12-01'), color='white', linestyle='--', lw=1) # Canada Meng Wanzhou arrested
    ax.annotate(' Huawei\n CFO\n arrested', xy=('2018-12-01', LOW_LABEL), color='white', fontweight='bold'    )

# UK events
if COUNTRY2 in ['UK','CA','US','NZ']: 
    ax.axvline( pd.to_datetime('2020-01-28'), color='red', linestyle=':', lw=1 , alpha=0.5) # UK Allows Huawei to build network
    ax.annotate(' UK to \n allow \n Huawei ', xy=('2020-01-28', MID_LABEL), color='red', horizontalalignment='right'    )
    # Jan. 29, 2020  - Europe allows Huawei for 5G through security guidelines
    ax.axvline( pd.to_datetime('2020-01-29'), color='red', linestyle=':', lw=1, alpha=0.5) # Jan. 29, 2020  - Europe allows Huawei for 5G through security guidelines
    ax.annotate(' EU to \n allow \n Huawei ', xy=('2020-01-29', TOP_LABEL*.85), color='red' ,horizontalalignment='right'   )
    # Feb 11, 2020 - US finds Huawei has backdoor access to mobile networks globally, report says
    ax.axvline( pd.to_datetime('2020-02-11'), color='red', linestyle=':', lw=1, alpha=0.5) # Feb 11, 2020 - US finds Huawei has backdoor access to mobile networks globally, report says
    ax.annotate(' Huawei \n backdoors \n found ', xy=('2020-02-11', MID_LABEL), color='red'  )
    if COUNTRY2 != 'NZ':
        ax.axvline( pd.to_datetime('2020-03-10'), color='white', linestyle='--', lw=1) # UK MP rebellion
        ax.annotate(' UK\n MPs \n lose \n vote', xy=('2020-03-10', LOW_LABEL), color='white', fontweight='bold'   )

# US events    
if COUNTRY2 in ['US','CA','AU']:
    # Nov 22, 2019 - Huawei banned from US subsidies
    #ax.axvline( pd.to_datetime('2019-11-22'), color='red', linestyle=':', lw=1, alpha=0.5) # Jan. 29, 2020  - Europe allows Huawei for 5G through security guidelines
    #ax.annotate(' US \n subsidies \n ban ', xy=('2019-11-22', TOP_LABEL*.85), color='red' ,horizontalalignment='right'   )
    ax.fill_between(['2019-10-16', '2020-02-11'], [ymax,ymax], color='gray', alpha=0.2) # ratcheting up of tensions
    ax.axvline( pd.to_datetime('2020-12-04'), color='red', linestyle=':', lw=1) # US deal to release Meng?
    ax.annotate(' Trump \n alludes to \n release of \n Huawei \n CFO ', xy=('2020-12-04', MID_LABEL), color='red' , horizontalalignment='left'  )

# AU events    
if COUNTRY2 in ["AU"]:
    ax.axvline( pd.to_datetime('2018-08-27'), color='red', linestyle=':', lw=1) # Turnbull resigns
    ax.annotate(' Turnbull\n resigns ', xy=('2018-08-24', (TOP_LABEL*0.9)), color='red'   )
    ax.axvline( pd.to_datetime('2018-06-06'), color='red', linestyle=':', lw=1) # FB shared data with Huawei scandal
    ax.annotate(' FB \n shares \n data ', xy=('2018-06-06', MID_LABEL), color='red', horizontalalignment='right'   )
    # UK announces to allow Huawei into network before backpedalling
    ax.axvline( pd.to_datetime('2020-02-28'), color='red', linestyle=':', lw=1) # UK Allows Huawei to build network
    ax.annotate(' UK \n allows \n Huawei ', xy=('2020-02-28', MID_LABEL), color='red' , horizontalalignment='right'  )

# NZ events    
if COUNTRY2 == "NZ":
    # https://www.sparknz.co.nz/news/GCSB_declines_Spark_proposal_Huawei/
    ax.axvline( pd.to_datetime('2018-11-28'), color='white', linestyle='--', lw=1, alpha=1) # GCSB blocks use of Huawei by Spark
    ax.annotate(' GCSB \n block ', xy=('2018-11-28', LOW_LABEL*.5), color='white', horizontalalignment='right',fontsize=10, weight='bold'  )
    # https://www.stuff.co.nz/business/112486350/huawei-nz-we-know-the-rumours-gossip-and-innuendo-about-us?rm=m
    ax.axvline( pd.to_datetime('2019-05-06'), color='white', linestyle='--', lw=1) # Huawei PR
    ax.annotate(' Huawei \n PR ', xy=('2019-05-06', 40), color='white', weight='bold'   )
    # Reports of NZ tensions with China
    ax.axvline( pd.to_datetime('2019-02-12'), color='white', linestyle='--', lw=1) # 
    ax.annotate(' NZ-China  \n tension  ', 
                bbox=dict(boxstyle="round", fc="1"), # foreground color?
                xytext=('2019-01-01', MID_LABEL*1.25), 
                color='black', horizontalalignment='right', 
                xy=('2019-02-12', 400), 
                arrowprops=dict(
                arrowstyle="->",
                            #connectionstyle="bar",
                    connectionstyle="arc",
                            #ec="k",
                            shrinkA=0, shrinkB=5
                ) 
               )

plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.title(f'Themes about Huawei discussed over time ({COUNTRY2})')
plt.xlabel('Date')
plt.ylabel('Theme discussed')


import matplotlib
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(7.5, 5)

plt.savefig(f'stacked_area_{COUNTRY2}.pdf', bbox_inches='tight')  # saves the current figure
plt.show()
10500.599999999999
In [ ]:
 
In [ ]:
set( df.topic )
In [14]:
# This only works when the data has topics from the BERTopic model
if TOPIC_MODEL:
    print( "Running!" )
    # Now let's see the overlap between the thematic categories and the BERTopic topics
    key_themes_lst = ['Allies','Business','Politics','Security','Technology']
    df = df.sort_values(by='month', ascending=True)
    sdf = pd.pivot_table( df, index=['topic','name'] ,aggfunc=np.sum )
    sdf['row_sum'] = sdf[key_themes_lst].sum(axis=1)
    #sdf.drop('index', axis=1, inplace=True)
    #sdf.drop('topic', axis=1, inplace=True)
    sdf.drop('row_sum', axis=1, inplace=True)
    sdf[key_themes_lst] = sdf[key_themes_lst].div(sdf[key_themes_lst].sum(axis=1), axis=0) # normalize

    sdf[key_themes_lst] = sdf[key_themes_lst].round(decimals=2)
    #sdf.style.set_precision(2).background_gradient(cmap='RdYlGn',axis=1,vmin=0, vmax=1)
    sdf_styled = sdf.style.set_precision(2).background_gradient(cmap='Blues',axis=1,vmin=0, vmax=1)
    sdf_styled

    # Write out the styled table to disk
    #!python -m pip install dataframe_image
    import dataframe_image as dfi
    dfi.export(sdf_styled, f'{COUNTRY2}_theme_table_styled.png')
In [16]:
sdf_styled
Out[16]:
False