import pandas as pd import numpy as np import networkx as nx import csv # ---------------------- # Config # ---------------------- #COUNTRY_LIST = ['CAN', 'CN1', 'USA', 'IND','DEU','JPN','KOR','HKG','NLD','ITA','BEL','FRA','RUS','GBR','MEX','ESP','AUS','NZL','BRA'] COUNTRY_LIST = ['NZL', 'CN1' ] YEAR="2018" IGNORE_THRESHOLD = 100 # ignore edges below this COUNTRY = "_".join(COUNTRY_LIST) # ---------------------- # Load ISIC data -- use to get human readable industry names # ---------------------- isic={} r = csv.reader(open("ISIC_Rev_4_english_structure.Txt",'r')) for row in r: isic[row[0]] = row[1] try: isic[int(row[0])] = row[1] except: pass # ---------------------- # Read the OECD input output tables. You can download from # https://www.oecd.org/sti/ind/inter-country-input-output-tables.htm # ---------------------- df = pd.read_csv(open(f'ICIO2021_{YEAR}.csv','r')) cols = list( df.columns ) cols[0] = 'index' # first column is blank so call it 'index' df.columns = cols # ---------------------- # Helper functions # ---------------------- def is_int(txt): """ Determine whether the text is an integer or not """ try: int(txt) return True except: pass return False def keep_name(txt): """ OECD IO Tables have country and industry code in the name. ex: AUS_07T08 Other fields are summaries of industries (like AUS_TAXAMT etc.) we're not interested in those. Parse the name to ensure that there is an industry code. If there's a 'T' ensure that there are two codes. """ if 'index' in txt: return True found_match = False for c in COUNTRY_LIST: if txt.startswith(c): # We are filtering by a single country! found_match = True if not found_match: return False items = txt.split('_') end = items[1] if 'T' in end: # Check that this represents industry code (always an int) and not a word like TAX split_ends = end.split('T') res = ( is_int( split_ends[1] ) & ( is_int(split_ends[0]) ) ) #print(f" {split_ends} = {res}" ) # testing! return res if is_int( end ): return True # keep return False def get_isic(txt): """ Get the relevat ISIC human readable industry. If there are multiple industries, then combine them """ found_match = False nation = '' for c in COUNTRY_LIST: # Make sure we're in a country-industry column if c in txt: found_match = True if len(COUNTRY_LIST) > 1: nation = c # If there are multiple countries, set to nation name if not found_match: return '' items = txt.split('_') end = items[1] if end in isic: return cln(isic[end],nation) if 'T' in end: split_ends = end.split('T') txt = '' for s in split_ends: if s in isic: if len(txt) > 0: txt = f'{txt};{cln(isic[s])}' else: txt = cln(isic[s],nation) return txt def cln(txt,nation=''): """ Cleanup text fields from ISIC -- remove verbose phrases and shorten text returned to 20 characters """ SHORTEN_LEN=10 phrases = ['Mining of','Manufacture of','Extraction of','Activities of','Construction of'] for p in phrases: txt = txt.replace( p, '' ).strip() if nation != '': nation = f'{nation}:' return f'{nation}{txt[0:SHORTEN_LEN].capitalize()}' # ---------------------- # Main # ---------------------- # ---------------------- # Filter data -- The data is an adjacency matrix. Make it long! # ---------------------- # i.e. a matrix of trade flows from one country's industry to another domestic or foreign industry. # I'm just interested in domestic trade flows from one industry to another. filter_lst = [i for i in list(df.columns) if keep_name(i) ] # Filter column names to the COUNTRY of interest and 'index' print( filter_lst ) df = df[ filter_lst ] # Remove the columns that aren't in the list df = df[ df['index'].isin( filter_lst ) ] # Filter the row names (which are the same as the column names) based on the filter list print( df.head() ) # Peak at the results df.set_index("index",inplace=True) # first column is filled with strings (i.e. the filtered row names) # We make this into an index so we can operate on the numeric contents of the table cells # ---------------------- # Convert the adjacency matric into an edge list -- this is smaller and the sort of data Power BI expects to see # ---------------------- v = df.rename_axis('source').reset_index() # rename index to source node df = pd.melt( # Go from wide to long v, id_vars='source', value_name='weight', # the numeric value of trade flows between two industries var_name='target' # the other industry )#.query('Source != Target')\ # Use to drop self-loops. Keep them for now! #.reset_index(drop=True) #df['isic'] = df.source.apply( lambda x: get_isic(x)) df = df[df.weight > IGNORE_THRESHOLD] # This is for visualization purposes. Remove the small economic ties. print( df.head() ) # Review the long results print( df.size ) # ---------------------- # Build the network! Can visualize this file using Gephi # ---------------------- g = nx.from_pandas_edgelist(df, source="source", target="target", edge_attr='weight') for n in g.nodes: g.nodes[n]['label'] = get_isic(n) nx.write_gexf(g,f"{COUNTRY}_ICIO_{YEAR}.gexf") # ---------------------- # Save the edge list to read into other packages like Power BI # ---------------------- df['source'] = df.source.apply(lambda x:get_isic(x) ) df['target'] = df.target.apply(lambda x:get_isic(x) ) df.to_csv(f"{COUNTRY}_edges_{YEAR}.csv")