+ indep. WoS citations

Python and Networks // 2017-05-09 // Agglomerative hierarchical clustering

Problem

Average-linkage based agglomerative hierarchical clustering of the characters of a file. The similarity of two characters is defined as the number of times they appear next to each other.

Solution (example)

1.  Python code (clus.py)

import sys
import numpy as np

# ===== read arguments from the command line =====

# Unpack exactly three command-line values:
# sys.argv[0] is this script itself (the file whose characters get clustered),
# then the output network file name and the output cluster-list file name.
# The tuple unpack raises ValueError if the argument count is not three.
_thisFile, _outFileNw, _outFileClus = sys.argv

# ===== function definitions =====

# compute cp2n{charA charB} (character pair to number),
# which is how many times the characters charA and charB
# appear next to each other in the file "inFile"
# - note: self-links are ignored

def comp_cp2n_noSelf(inFile,cp2n):
    """Count adjacent character pairs in the file ``inFile``.

    Fills the dict ``cp2n`` (character pair -> number) in place: each key
    is the two characters of an adjacent pair joined in sorted order (so
    the pair is undirected), each value is how many times that pair occurs.
    Pairs of two identical characters (self-links) are ignored.
    """

    # read the entire input file into a single string
    with open(inFile,"r") as f:
        data = f.read()

    # walk over all adjacent character pairs
    for charA, charB in zip(data, data[1:]):
        # skip self-links (two identical neighboring characters)
        if charA != charB:
            # canonical key: the two characters in sorted order,
            # so e.g. "ba" and "ab" count as the same undirected pair
            charPair = ''.join(sorted((charA, charB)))
            # increment the pair count, starting from zero for unseen pairs
            cp2n[charPair] = cp2n.get(charPair, 0) + 1

# ---------------------

# IF the character "c" is a number or a letter,
# THEN return it unchanged,
# ELSE return a "*" followed by the ASCII code of character "c"
def charConv(c):
    """Return character ``c`` unchanged if it is an ASCII digit or latin
    letter, otherwise return '*' followed by its decimal ASCII code."""

    code = ord(c)
    # digits 0-9, uppercase A-Z and lowercase a-z pass through unchanged
    is_plain = (48 <= code <= 57) or (65 <= code <= 90) or (97 <= code <= 122)
    return c if is_plain else "*" + str(code)

# ---------------------

# write the undirected weighted network
# in nodePair2weight to the file outFile

def write_undirWeightedNetwork(nodePair2weight,outFile):
    """Write the undirected weighted network in ``nodePair2weight``
    to the file ``outFile``.

    nodePair2weight maps a two-character string (the sorted pair of node
    names, i.e. an undirected link) to the weight of that link.  Node
    names are passed through charConv(), so non-alphanumeric characters
    are printed as '*' followed by their ASCII code.  Links are written
    in descending order of their weights.
    """

    # --- count nodes ---
    # collect every character that appears in at least one link
    nodeSet = set()
    for nodePair in nodePair2weight:
        # each key is a two-character string; add both characters
        nodeSet.update(nodePair)

    # --- print output ---
    # the "with" statement closes the file on exit
    # (fix: removed the redundant explicit f.close() inside the with block)
    with open(outFile,"w") as f:
        # print file header
        f.write("# Number of nodes: %d\n" % len(nodeSet))
        f.write("# Number of undirected links: %d\n" % len(nodePair2weight))
        f.write("# Note: characters that are neither numbers nor letters\n")
        f.write("#       are printed as their ASCII codes after a \"*\"\n")
        f.write("# -----------\n")
        f.write("# Name of node A (a character or its ASCII code after a \"*\")\n")
        f.write("#\tName of node B (another character)\n")
        f.write("#\t\tNumber of times we see these two characters next to each other\n")
        f.write("\n")
        # print data: links sorted in the descending order of their weights
        for nodePair in sorted( nodePair2weight, key=nodePair2weight.get, reverse=True ):
            # extract the two node names (the two end points of this link)
            nodeA, nodeB = list(nodePair)
            # convert both node names so that only printable characters are written
            nodeAconv = charConv(nodeA)
            nodeBconv = charConv(nodeB)
            # print the data line: node names and link weight
            f.write("%s\t%s\t%g\n" % (nodeAconv,nodeBconv,nodePair2weight[nodePair]))

# ------------------------------

# the n2n2w[node1] book-keeping contains the strengths of the connections of other nodes to node1
# add "add_number" to the strength of the connection of node2 as a neighbor of node1

def n2n2w_add(n2n2w,node1,node2,add_number):
    """Add ``add_number`` to the weight of the node1 -> node2 connection
    in the nested dict ``n2n2w``, creating missing entries on the fly.

    n2n2w[node1] holds the strengths of the connections of other nodes
    to node1.
    """

    # make sure node1 has a (possibly empty) dict of neighbors
    neighbors = n2n2w.setdefault(node1, {})
    # an unseen node2 neighbor starts at weight zero
    neighbors[node2] = neighbors.get(node2, 0) + add_number

# ------------------------------

# copy data structure: key1 -> value list

def copy_k2vL_struct(k2vL_from,k2vL_to):
    """Copy a key -> value-list mapping from ``k2vL_from`` into
    ``k2vL_to`` in place; each value list is shallow-copied so the
    target does not share list objects with the source."""

    for key, value_list in k2vL_from.items():
        # list() makes an independent shallow copy of the value list
        k2vL_to[key] = list(value_list)

# ------------------------------

# copy data structure: key1->key2->value

def copy_k2k2v_struct(k2k2v_from,k2k2v_to):
    """Copy a key1 -> key2 -> value nested mapping from ``k2k2v_from``
    into ``k2k2v_to`` in place; each inner dict is shallow-copied so the
    target does not share dict objects with the source."""

    for key1, inner in k2k2v_from.items():
        # dict() makes an independent shallow copy of the inner mapping
        k2k2v_to[key1] = dict(inner)

# ------------------------------

# return the two keys for which the value is maximal

def matrixMaxValueAndPosition(k2k2v):
    """Return ``(maxValue, key1, key2)`` for the largest value in the
    nested dict ``k2k2v``; on ties the pair seen first in iteration
    order wins (strict '>' comparison)."""

    # start from the very first entry of the matrix
    key1ofMax = list(k2k2v)[0]
    key2ofMax = list(k2k2v[key1ofMax])[0]
    maxValue = k2k2v[key1ofMax][key2ofMax]

    # scan every (primary key, secondary key) cell
    for key1, row in k2k2v.items():
        for key2, value in row.items():
            # keep the first-seen maximum: replace only on strictly larger
            if value > maxValue:
                maxValue, key1ofMax, key2ofMax = value, key1, key2

    return maxValue, key1ofMax, key2ofMax

# ------------------------------

# agglomerative hierarchical clustering
# of the weighted network of links stored
# in nodePair2weight
#
# at each step, write the current clusters to the output file

def agglom_hier_clus(nodePair2weight,outFileClus):
    """Average-linkage agglomerative hierarchical clustering of the
    weighted network stored in ``nodePair2weight``.

    nodePair2weight maps a sorted two-character string (an undirected
    link) to that link's weight.  Starting from singleton clusters, the
    two most similar clusters are merged repeatedly until one cluster
    remains; after each step the current cluster list is recorded, and
    the whole history is written to the file ``outFileClus``.
    """

    # --- convert the data structure storing the network ---

    # assemble n2n2w, where n2n2w{nodeA}{nodeB} is
    # the weight of the nodeA-nodeB link
    # - note: in n2n2w every link is saved twice, once in each direction
    n2n2w = {}

    # loop over the list of links
    for nodePair in nodePair2weight:
        # the two nodes (the key is a two-character string)
        nodeA, nodeB = list(nodePair)
        # save nodeB as a neighbor of nodeA with the given weight
        n2n2w_add(n2n2w,nodeA,nodeB,nodePair2weight[nodePair])
        # save nodeA as a neighbor of nodeB with the same weight
        n2n2w_add(n2n2w,nodeB,nodeA,nodePair2weight[nodePair])

    # --- declare data structures for clustering ---

    # each node starts in its own cluster, indexed by the node itself
    # n2c[n] (node to cluster) is the cluster index of node <n>
    n2c = { _:_ for _ in n2n2w }
    # each cluster initially contains the single node it is named after
    # c2nL[c] (cluster to node list) is the list of nodes in cluster <c>
    c2nL = { _:[_] for _ in n2c }
    # c2c2as[c1][c2] (cluster to cluster to average similarity):
    # the average similarity between the items of clusters c1 and c2
    # initially this data structure is the same as the weighted network,
    # because every cluster is a single node
    c2c2as = {}; copy_k2k2v_struct(n2n2w,c2c2as)
    ### test # print(c2c2as)
    # t2cL[t][c]: after time step t this is the list of items in cluster c
    t2cL = []
    # at t=0 the list of clusters is c2nL: save c2nL as the last item of the t2cL list
    t2cL.append({}); copy_k2vL_struct(c2nL,t2cL[-1])

    # --- agglomerative hierarchical clustering ---

    # exit condition: all nodes are in the same cluster
    while 1 < len(c2nL):
        ### test: print the number of clusters
        #print("%d clusters" % len(c2nL))
        # find the two clusters with the highest average similarity
        maxValue, cA, cB = matrixMaxValueAndPosition(c2c2as)
        ### test
        #print("%g %s %s" % (maxValue,charConv(cA),charConv(cB)))
        #print(c2c2as)

        # merge cluster cB into cluster cA: update the book-keeping
        # - for each node of cluster cB:
        for node in c2nL[cB]:
            # set its cluster index to cA
            n2c[node] = cA
            # append it to the node list of cluster cA
            c2nL[cA].append(node)
        # - remove cluster cB from the cluster -> node list book-keeping
        del c2nL[cB]
        ### test
        #print(c2nL)

        # remove the cluster similarity values of the removed cluster cB
        # with all other clusters
        # - NOTE: each cluster similarity value is saved in both directions,
        #   so both directions have to be deleted
        # (1) remove all cOther (other cluster) -> cB similarity values
        for cOther in c2c2as[cB]:
            del c2c2as[cOther][cB]
        # (2) remove similarity values this way as well: cB -> all other clusters
        del c2c2as[cB]

        # re-compute the cluster similarity of the merged cluster cA with all
        # remaining clusters based on the n2n2w node-node similarity values
        comp_selClusSim(n2n2w,n2c,c2nL,cA,c2c2as)

        # save the current cluster list as the next history entry
        t2cL.append({}); copy_k2vL_struct(c2nL,t2cL[-1])

    # --- print the clusters for each step ---
    # open the output file
    with open(outFileClus,"w") as f:
        # print file header
        f.write("# Clusters during the successive steps of average-similarity based agglomerative hierarchical clustering\n")
        #f.write("# Step number (0 means: before the start)\n#\tList of items in this cluster\n")
        f.write("# List of items in each cluster\n")
        f.write("# - clusters are separated with a single space character\n")
        f.write("# - whitespace characters are escaped, e.g., the \'newline\' character is written as \\n \n")
        f.write("# - clusters are written in the descending order of their sizes\n")
        f.write("\n")
        # print data: one line per time step
        for t in range(len(t2cL)):
            # c2s (cluster index to cluster size) mapping at time t
            c2s = {}
            for clus in (t2cL[t].keys()):
                c2s[clus] = len(t2cL[t][clus])
            # write the time step
            #f.write("%d\t" % t)
            # for each cluster at time t: write the (sorted) nodes of the cluster
            # note: whitespace characters are replaced by their "escaped" versions
            # note: clusters are listed in the descending order of their sizes
            f.write( ' '.join( ''.join( sorted(list_escSpac(t2cL[t][clus])) ) for clus in sorted( c2s, key=c2s.get, reverse=True ) ) )
            # close the current data line
            f.write("\n")

# ------------------------------

# in the list of characters "char_list_in"
# replace \ , \t, \n, \f, \r characters by their escaped versions

def list_escSpac(char_list_in):
    """Return a copy of the character list ``char_list_in`` in which the
    whitespace characters space, tab, newline, carriage return and form
    feed are replaced by their escaped two-character versions
    (e.g. a newline becomes the two characters backslash + 'n')."""

    # mapping from a whitespace character to its escaped form
    char_map = { ' ':r'\ ', '\t':r'\t', '\n':r'\n', '\r':r'\r', '\f':r'\f' }

    # fix: the original tested membership with "in list(char_map.keys())",
    # which rebuilt a list on every iteration; dict.get does a single O(1)
    # lookup and falls back to the character itself when no escape is needed
    return [char_map.get(c, c) for c in char_list_in]

# ------------------------------

# n2n2w[nA][nB] is the similarity of nodes nA and nB
# c2c2as[cA][cB] is the similarity of clusters cA and cB
# n2c[nA]: is the cluster index of node nA
# c2nL[cA]: is the list of nodes in cluster cA
#
# the cluster-cluster similarity of clusters cA and cB will be the average
# of node-node similarity values of the nodes nA and nB, for which
# nA is in cA, and nB is in cB

def comp_selClusSim(n2n2w,n2c,c2nL,c_sel,c2c2as):
    """Recompute the average-linkage similarity of cluster ``c_sel``
    to every other cluster, updating ``c2c2as`` in place.

    n2n2w[nA][nB]  : node-node similarity
    n2c[n]         : cluster index of node n (kept for the caller's interface)
    c2nL[c]        : list of nodes in cluster c
    c2c2as[cA][cB] : cluster-cluster average similarity, written symmetrically

    The similarity of c_sel and another cluster is the mean of the
    node-node similarities over all cross-cluster node pairs; a missing
    link counts as similarity zero.
    """

    sel_nodes = c2nL[c_sel]
    for c_other, other_nodes in c2nL.items():
        # only pairs of distinct clusters get a similarity value
        if c_other == c_sel:
            continue
        # make sure both clusters own a similarity dict
        c2c2as.setdefault(c_other, {})
        c2c2as.setdefault(c_sel, {})
        # node-node similarities across the two clusters, 0 for absent links
        sim_values = [n2n2w[nA].get(nB, 0)
                      for nA in sel_nodes
                      for nB in other_nodes]
        # the cluster-cluster similarity is the mean of the pair values,
        # stored in both directions
        avg = np.mean(sim_values)
        c2c2as[c_sel][c_other] = avg
        c2c2as[c_other][c_sel] = avg

# ===== main =====

# distance/similarity-based iterative clustering:
# the group similarity is derived from the pair similarities
# (possible choices: min, max, average, geometric mean; here: average)

# build the frequency-weighted network of adjacent-character
# co-occurrences of this script's own source file
#
# _cp2n{charA charB} (character pair to number) is how many times
# the characters charA and charB appear next to each other
# - note: self-links (identical neighboring characters) are ignored
_cp2n = {}
comp_cp2n_noSelf(_thisFile,_cp2n)

# write the network to a file for visualization
write_undirWeightedNetwork(_cp2n,_outFileNw)

# run average-linkage agglomerative hierarchical clustering
# on the weighted network of character neighborships
agglom_hier_clus(_cp2n,_outFileClus)

2.  How to use the code

python3 clus.py charNw.txt clusList.txt

3.  Output files

Weighted network of characters: charNw.txt

List of clusters after each step: clusList.txt