import pandas as pd
import numpy as np
import networkx as nx
import csv

# ----------------------
# Config
# ----------------------
#COUNTRY_LIST = ['CAN', 'CN1', 'USA', 'IND','DEU','JPN','KOR','HKG','NLD','ITA','BEL','FRA','RUS','GBR','MEX','ESP','AUS','NZL','BRA']
COUNTRY_LIST = ['NZL', 'CN1' ]
YEAR="2018"
IGNORE_THRESHOLD = 100 # ignore edges below this

COUNTRY = "_".join(COUNTRY_LIST)

# ----------------------
# Load ISIC data -- use to get human readable industry names
# ----------------------
isic={}
r = csv.reader(open("ISIC_Rev_4_english_structure.Txt",'r')) 
for row in r:
    isic[row[0]] = row[1] 
    try:
        isic[int(row[0])] = row[1] 
    except:
        pass


# ----------------------
# Read the OECD input output tables. You can download from 
# https://www.oecd.org/sti/ind/inter-country-input-output-tables.htm
# ----------------------
df = pd.read_csv(open(f'ICIO2021_{YEAR}.csv','r'))

cols = list( df.columns )
cols[0] = 'index' # first column is blank so call it 'index'
df.columns = cols

# ----------------------
# Helper functions
# ----------------------
def is_int(txt):
    """
    Determine whether the text is an integer or not
    """
    try:
        int(txt)
        return True
    except:
        pass
    return False

def keep_name(txt):
    """
    OECD IO Tables have country and industry code in the name. ex: AUS_07T08
    Other fields are summaries of industries (like AUS_TAXAMT etc.) we're not interested in those.
    Parse the name to ensure that there is an industry code. If there's a 'T' ensure that there are two codes.
    """
    if 'index' in txt:
        return True
    found_match = False
    for c in COUNTRY_LIST:
        if txt.startswith(c): # We are filtering by a single country! 
           found_match = True
    if not found_match:
        return False
    items = txt.split('_')
    end = items[1]
    if 'T' in end: # Check that this represents industry code (always an int) and not a word like TAX
        split_ends = end.split('T')
        res = ( is_int( split_ends[1] ) & ( is_int(split_ends[0]) ) )
        #print(f"   {split_ends} = {res}" ) # testing!
        return res
    if is_int( end ):    
        return True # keep
    return False

def get_isic(txt):
    """
    Get the relevat ISIC human readable industry. If there are multiple industries, then combine them
    """
    found_match = False
    nation = ''
    for c in COUNTRY_LIST: # Make sure we're in a country-industry column
        if c in txt:
            found_match = True
            if len(COUNTRY_LIST) > 1:
                nation = c # If there are multiple countries, set to nation name
    if not found_match:
        return ''

    items = txt.split('_')
    end = items[1]
    if end in isic:
        return cln(isic[end],nation)
    if 'T' in end:
        split_ends = end.split('T')
        txt = ''
        for s in split_ends:
            if s in isic:
                if len(txt) > 0:
                    txt = f'{txt};{cln(isic[s])}'
                else:
                    txt = cln(isic[s],nation)
        return txt

def cln(txt,nation=''):
    """
    Cleanup text fields from ISIC -- remove verbose phrases and shorten text returned to 20 characters
    """
    SHORTEN_LEN=10
    phrases = ['Mining of','Manufacture of','Extraction of','Activities of','Construction of']
    for p in phrases:
        txt = txt.replace( p, '' ).strip()
    if nation != '':
        nation = f'{nation}:'
    return f'{nation}{txt[0:SHORTEN_LEN].capitalize()}'


# ----------------------
# Main
# ----------------------


# ----------------------
# Filter data -- The data is an adjacency matrix. Make it long! 
# ----------------------
# i.e. a matrix of trade flows from one country's industry to another domestic or foreign industry.
# I'm just interested in domestic trade flows from one industry to another.
filter_lst = [i for i in list(df.columns) if keep_name(i) ] # Filter column names to the COUNTRY of interest and 'index'
print( filter_lst )
df = df[ filter_lst ] # Remove the columns that aren't in the list
df = df[ df['index'].isin( filter_lst ) ] # Filter the row names (which are the same as the column names) based on the filter list

print( df.head() ) # Peak at the results

df.set_index("index",inplace=True)  # first column is filled with strings (i.e. the filtered row names)
                                    # We make this into an index so we can operate on the numeric contents of the table cells


# ----------------------
# Convert the adjacency matric into an edge list -- this is smaller and the sort of data Power BI expects to see
# ----------------------
v = df.rename_axis('source').reset_index() # rename index to source node
df = pd.melt( # Go from wide to long
      v,
      id_vars='source',
      value_name='weight', # the numeric value of trade flows between two industries
      var_name='target' # the other industry
)#.query('Source != Target')\ # Use to drop self-loops. Keep them for now!
 #.reset_index(drop=True)


#df['isic'] = df.source.apply( lambda x: get_isic(x))
df = df[df.weight > IGNORE_THRESHOLD]  # This is for visualization purposes. Remove the small economic ties.

print( df.head() ) # Review the long results
print( df.size )

# ----------------------
# Build the network! Can visualize this file using Gephi
# ----------------------
g = nx.from_pandas_edgelist(df, source="source", target="target", edge_attr='weight')
for n in g.nodes:
    g.nodes[n]['label'] = get_isic(n)
nx.write_gexf(g,f"{COUNTRY}_ICIO_{YEAR}.gexf")

# ----------------------
# Save the edge list to read into other packages like Power BI
# ----------------------
df['source'] = df.source.apply(lambda x:get_isic(x) )
df['target'] = df.target.apply(lambda x:get_isic(x) )
df.to_csv(f"{COUNTRY}_edges_{YEAR}.csv")