Python and Networks // 2017-05-09 // Agglomerative hierarchical clustering
Problem
Average-linkage based agglomerative hierarchical clustering of the characters of a file. The similarity of two characters is defined as the number of times they appear next to each other.
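For illustration, a short worked example of this similarity definition (the string "abcab" below is an arbitrary choice): the adjacent unordered character pairs of "abcab" are ab, bc, ac, ab, so the similarity of 'a' and 'b' is 2, while the 'b'-'c' and 'a'-'c' similarities are both 1. A minimal Python sketch of the same computation:

data = "abcab"
# list of adjacent (sorted) character pairs, self-pairs skipped
pairs = [''.join(sorted(data[i:i+2])) for i in range(len(data)-1) if data[i] != data[i+1]]
print(pairs)   # ['ab', 'bc', 'ac', 'ab'] -> the similarity of 'a' and 'b' is 2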
Solution (example)
1. Python code (clus.py)
import sys
import numpy as np
# ===== read arguments from the command line =====
# note: sys.argv[0], the name of this script file, will be used as the input file
_thisFile, _outFileNw, _outFileClus = sys.argv
# ===== function definitions =====
# compute cp2n{charA charB} (character pair to number),
# which is how many times the characters charA and charB
# appear next to each other in the file "inFile"
# - note: self-links are ignored
def comp_cp2n_noSelf(inFile,cp2n):
    # open the input file for reading
    with open(inFile,"r") as f:
        # read the entire input file into a single string variable
        data = f.read()
    ### test
    #data = data[0:10]
    # loop through the list of adjacent character pairs:
    for i in range(len(data)-1):
        # the current character pair is data[i],data[i+1]
        # proceed only if these two characters are different
        if data[i] != data[i+1]:
            # set the current character pair
            charPair = ''.join(sorted(data[i:i+2]))
            # IF we have not seen the current character pair yet,
            if charPair not in cp2n:
                # THEN declare the weight of this pair as zero
                cp2n[charPair] = 0
            # in all cases: add 1 to the weight of this char pair
            cp2n[charPair] += 1
    ### test # print(cp2n)
# ---------------------
# IF the character "c" is a number or a letter,
# THEN return it unchanged,
# ELSE return a "*" followed by the ASCII code of character "c"
def charConv(c):
    # ca (ca = character ASCII) is the ASCII code of character "c"
    ca = ord(c)
    # IF the character "c" is a number, an uppercase or a lowercase latin letter,
    if 48 <= ca <= 57 or 65 <= ca <= 90 or 97 <= ca <= 122:
        # THEN return this character
        return c
    # ELSE: return the ASCII code of the character with a "*" prepended to it
    else:
        return "*"+str(ca)
# ---------------------
# write the undirected weighted network
# in nodePair2weight to the file outFile
def write_undirWeightedNetwork(nodePair2weight,outFile):
    # --- count nodes ---
    # declare the set of nodes
    nodeSet = set()
    # loop over the list of all links
    for nodePair in nodePair2weight:
        # take the two node names (the two end points of this link)
        nodeA, nodeB = list(nodePair)
        # save both nodes to the set of node names
        for node in nodeA, nodeB:
            nodeSet.add(node)
    # --- print output ---
    # open the output file for writing
    # (the "with" statement closes the file automatically)
    with open(outFile,"w") as f:
        # print file header
        f.write("# Number of nodes: %d\n" % len(nodeSet))
        f.write("# Number of undirected links: %d\n" % len(nodePair2weight))
        f.write("# Note: characters that are neither numbers nor letters\n")
        f.write("# are printed as their ASCII codes after a \"*\"\n")
        f.write("# -----------\n")
        f.write("# Name of node A (a character or its ASCII code after a \"*\")\n")
        f.write("#\tName of node B (another character)\n")
        f.write("#\t\tNumber of times we see these two characters next to each other\n")
        f.write("\n")
        # print data: links sorted in the descending order of their weights
        for nodePair in sorted( nodePair2weight, key=nodePair2weight.get, reverse=True ):
            # extract the two node names
            nodeA, nodeB = list(nodePair)
            # convert both node names to make sure that only printable characters are printed
            nodeAconv = charConv(nodeA)
            nodeBconv = charConv(nodeB)
            # print the data line: node names and link weight
            f.write("%s\t%s\t%g\n" % (nodeAconv,nodeBconv,nodePair2weight[nodePair]))
# ------------------------------
# the n2n2w[node1] book-keeping contains the strengths of the connections of other nodes to node1
# add "add_number" to the strength of the connection of node2 as a neighbor of node1
def n2n2w_add(n2n2w,node1,node2,add_number):
    # IF node1 has no neighbors yet,
    if node1 not in n2n2w:
        # THEN declare its neighbors as an empty dict
        n2n2w[node1] = {}
    # IF node2 has not yet been saved as a neighbor of node1,
    if node2 not in n2n2w[node1]:
        # THEN declare their connection weight as zero
        n2n2w[node1][node2] = 0
    # in all cases: add "add_number" to the link strength
    n2n2w[node1][node2] += add_number
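### example: after n2n2w_add(n2n2w,"a","b",3) with n2n2w = {},
### the book-keeping is n2n2w == {"a": {"b": 3}}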
# ------------------------------
# copy data structure: key1 -> value list
def copy_k2vL_struct(k2vL_from,k2vL_to):
    # loop over the list of keys
    for key in k2vL_from:
        # declare list for this key
        k2vL_to[key] = []
        # for the current key:
        # copy all values
        for value in k2vL_from[key]:
            k2vL_to[key].append(value)
# ------------------------------
# copy data structure: key1->key2->value
def copy_k2k2v_struct(k2k2v_from,k2k2v_to):
    # loop over the list of primary keys
    for key1 in k2k2v_from:
        # declare dict for the primary key
        k2k2v_to[key1] = {}
        # for the current primary key:
        # loop over the list of secondary keys
        for key2 in k2k2v_from[key1]:
            # set key2-value
            k2k2v_to[key1][key2] = k2k2v_from[key1][key2]
# ------------------------------
# return the maximal value and the two keys for which the value is maximal
def matrixMaxValueAndPosition(k2k2v):
    # declare variables and set their default values: key1ofMax,key2ofMax,maxValue
    key1ofMax = list(k2k2v.keys())[0]
    key2ofMax = list(k2k2v[key1ofMax].keys())[0]
    maxValue = k2k2v[key1ofMax][key2ofMax]
    ### test
    #print("%g %s %s" % (maxValue,charConv(key1ofMax),charConv(key2ofMax))); sys.exit()
    # loop over the list of primary keys
    for key1 in k2k2v:
        # loop over the secondary keys for this primary key
        for key2 in k2k2v[key1]:
            # IF the current value is higher than the
            # value that we found so far to be the largest,
            if k2k2v[key1][key2] > maxValue:
                # THEN set the highest value
                maxValue = k2k2v[key1][key2]
                # AND set its indices
                key1ofMax = key1
                key2ofMax = key2
    # return results
    return maxValue, key1ofMax, key2ofMax
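### example: matrixMaxValueAndPosition({"x": {"y": 5}, "y": {"x": 5}})
### returns (5, "x", "y")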
# ------------------------------
# agglomerative hierarchical clustering
# of the weighted network of links stored
# in nodePair2weight
#
# at each step, write the current clusters to the output file
def agglom_hier_clus(nodePair2weight,outFileClus):
    # --- convert the data structure storing the network ---
    # assemble n2n2w, where n2n2w{nodeA}{nodeB} is
    # the weight of the nodeA-nodeB link
    # - note: in n2n2w all links are saved twice
    n2n2w = {}
    # loop over the list of links
    for nodePair in nodePair2weight:
        # the two nodes
        nodeA, nodeB = list(nodePair)
        # save nodeB as a neighbor of nodeA with the given weight
        n2n2w_add(n2n2w,nodeA,nodeB,nodePair2weight[nodePair])
        # save nodeA as a neighbor of nodeB with the same weight
        n2n2w_add(n2n2w,nodeB,nodeA,nodePair2weight[nodePair])
    # --- declare data structures for clustering ---
    # each node belongs to a separate cluster that has the index of the node
    # n2c[n] (node to cluster) is the cluster index of node <n>
    n2c = { _:_ for _ in n2n2w }
    # each cluster contains a single node that has the index of the cluster
    # c2nL[c] (cluster to node list) is the list of nodes in cluster <c>
    c2nL = { _:[_] for _ in n2c }
    # c2c2as[c1][c2] (cluster to cluster to average similarity):
    # the average similarity between the items of clusters c1 and c2
    # initially this data structure is the same as the weighted network
    c2c2as = {}; copy_k2k2v_struct(n2n2w,c2c2as)
    ### test # print(c2c2as)
    # t2cL[t][c] after time step t this is the list of items in cluster c
    t2cL = []
    # at t=0 the list of clusters is c2nL: save c2nL as the last item of the t2cL list
    t2cL.append({}); copy_k2vL_struct(c2nL,t2cL[-1])
    # --- agglomerative hierarchical clustering ---
    # exit condition: all nodes are in the same cluster
    while 1 < len(c2nL):
        ### test: print the number of clusters
        #print("%d clusters" % len(c2nL))
        # find the two clusters with the highest similarity
        maxValue, cA, cB = matrixMaxValueAndPosition(c2c2as)
        ### test
        #print("%g %s %s" % (maxValue,charConv(cA),charConv(cB)))
        #print(c2c2as)
        # merge these two clusters: change the book-keeping data structures
        # - for each node of cluster cB:
        for node in c2nL[cB]:
            # set its cluster index to cA
            n2c[node] = cA
            # append it to the node list of cluster cA
            c2nL[cA].append(node)
        # - remove cluster cB from the cluster -> node list book-keeping
        del c2nL[cB]
        ### test
        #print(c2nL)
        # remove the cluster similarity values of the removed cluster cB
        # with all other clusters
        # - NOTE that each cluster similarity value is saved in both directions
        # (1) remove all cOther (other cluster) -> cB similarity values
        for cOther in c2c2as[cB]:
            del c2c2as[cOther][cB]
        # (2) remove similarity values this way as well: cB -> all other clusters
        del c2c2as[cB]
        # re-compute the cluster similarity of cluster cA with all remaining clusters
        # based on the n2n2w node-node similarity values
        comp_selClusSim(n2n2w,n2c,c2nL,cA,c2c2as)
        # save the current cluster list
        t2cL.append({}); copy_k2vL_struct(c2nL,t2cL[-1])
    # --- print the clusters for each step ---
    # open the output file
    with open(outFileClus,"w") as f:
        # print file header
        f.write("# Clusters during the successive steps of average-similarity based agglomerative hierarchical clustering\n")
        #f.write("# Step number (0 means: before the start)\n#\tList of items in this cluster\n")
        f.write("# List of items in each cluster\n")
        f.write("# - clusters are separated with a single space character\n")
        f.write("# - whitespace characters are escaped, e.g., the \'newline\' character is written as \\n \n")
        f.write("# - clusters are written in the descending order of their sizes\n")
        f.write("\n")
        # print data: for each time step
        for t in range(len(t2cL)):
            # cluster index -> cluster size mapping, c2s (cluster to size)
            c2s = {}
            for clus in t2cL[t].keys():
                c2s[clus] = len(t2cL[t][clus])
            # write the time step
            #f.write("%d\t" % t)
            # for each cluster at time t: write the (sorted) nodes of the cluster
            # note: whitespace characters are replaced by their "escaped" versions
            # note: clusters are listed in the descending order of their sizes
            f.write( ' '.join( ''.join( sorted(list_escSpac(t2cL[t][clus])) ) for clus in sorted( c2s, key=c2s.get, reverse=True ) ) )
            # close the current data line
            f.write("\n")
# ------------------------------
# in the list of characters "char_list_in"
# replace the space, \t, \n, \f, \r characters by their escaped versions
def list_escSpac(char_list_in):
    # define the mapping
    char_map = { ' ':r'\ ', '\t':r'\t', '\n':r'\n', '\r':r'\r', '\f':r'\f' }
    # declare the output list
    char_list_out = []
    # loop over the list of indexes in this list
    for i in range(len(char_list_in)):
        # IF the current character is a whitespace character,
        if char_list_in[i] in char_map:
            # THEN replace it with its escaped version
            char_list_out.append(char_map[char_list_in[i]])
        else:
            char_list_out.append(char_list_in[i])
    # return the output value
    return char_list_out
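### example: list_escSpac(["a","\n","b"]) returns ["a", "\\n", "b"],
### i.e., the newline character is replaced by the two characters \ and n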
# ------------------------------
# n2n2w[nA][nB] is the similarity of nodes nA and nB
# c2c2as[cA][cB] is the similarity of clusters cA and cB
# n2c[nA]: is the cluster index of node nA
# c2nL[cA]: is the list of nodes in cluster cA
#
# the cluster-cluster similarity of clusters cA and cB will be the average
# of node-node similarity values of the nodes nA and nB, for which
# nA is in cA, and nB is in cB
def comp_selClusSim(n2n2w,n2c,c2nL,c_sel,c2c2as):
    # loop over the list of clusters other than the selected cluster
    for c_other in c2nL:
        if c_other != c_sel:
            # IF either of the two clusters has no similarity value
            # to any other cluster,
            for c_now in c_other, c_sel:
                if c_now not in c2c2as:
                    # THEN declare the dict of its similarity values to other clusters
                    c2c2as[c_now] = {}
            # declare the list of nA-nB node-node similarities
            # where nA is in c_sel and nB is in c_other
            sim_value_list = []
            # loop over the list of nodes in the selected cluster
            for nA in c2nL[c_sel]:
                # loop over the list of nodes in the other cluster
                for nB in c2nL[c_other]:
                    ### test
                    #print("%s %s" % (nA,nB))
                    # IF these two nodes do have a similarity value,
                    if nB in n2n2w[nA]:
                        # THEN add the similarity of these two nodes
                        # to the list of node-node similarities
                        sim_value_list.append(n2n2w[nA][nB])
                    # ELSE
                    else:
                        # add zero to the list
                        sim_value_list.append(0)
            # compute the average of the node-node similarity values
            # and save this average as the cluster-cluster similarity
            c2c2as[c_sel][c_other] = np.mean(sim_value_list)
            # save this value the other way, too
            c2c2as[c_other][c_sel] = c2c2as[c_sel][c_other]
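### example: if c_sel = "a" with c2nL["a"] = ["a"], c_other = "b" with
### c2nL["b"] = ["b","c"], and n2n2w["a"] = {"b": 3} (there is no a-c link),
### then sim_value_list = [3, 0] and c2c2as["a"]["b"] = np.mean([3, 0]) = 1.5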
# ===== main =====
# note: in similarity-based iterative (agglomerative) clustering the
# similarity of two groups can be computed from the pair similarities
# of their items in several ways: min, max, average, geometric mean;
# this code uses the average (see comp_selClusSim above)
#
# construct the frequency-weighted network of the
# character pair co-occurrences of the current file
#
# cp2n{charA charB} (character pair to number),
# is how many times the characters charA and charB
# appear next to each other in the current file
# - note: self-links are ignored
_cp2n = {}; comp_cp2n_noSelf(_thisFile,_cp2n)
# print network for visualization
write_undirWeightedNetwork(_cp2n,_outFileNw)
# agglomerative hierarchical clustering
# of the weighted network of character neighborships
agglom_hier_clus(_cp2n,_outFileClus)
2. How to use the code
python3 clus.py charNw.txt clusList.txt
Note: the clustered input file is the script itself, because sys.argv[0] (here: clus.py) is used as the input file name; the two command-line arguments set the names of the two output files.
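If an arbitrary file should be clustered instead of the script itself, a minimal change (a sketch, not part of the code above) is to read the input file name as an additional command-line argument:

# sketch: replace the argument-reading line of clus.py with
_thisFile, _inFile, _outFileNw, _outFileClus = sys.argv
# ... and pass _inFile instead of _thisFile in the main block:
_cp2n = {}; comp_cp2n_noSelf(_inFile,_cp2n)
# usage would then be, e.g.: python3 clus.py input.txt charNw.txt clusList.txt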
3. Output files
Weighted network of characters: charNw.txt
List of clusters after each step: clusList.txt
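4. Note on linkage rules
The main block above mentions that the group similarity can be computed from the item pair similarities as their min, max, average, or geometric mean. A minimal sketch of these four linkage rules (the list "sims" is a hypothetical example of the sim_value_list variable of clus.py):

import numpy as np
sims = [3, 1, 4, 2]                       # hypothetical node-node similarities
single   = min(sims)                      # single linkage: 1
complete = max(sims)                      # complete linkage: 4
average  = np.mean(sims)                  # average linkage (used by clus.py): 2.5
geomMean = np.exp(np.mean(np.log(sims)))  # geometric mean: (3*1*4*2)**0.25 ~ 2.21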