Python and Networks - Homework - 2016-03-22

Problem: From today's class notes, download links.txt, the list of hyperlinks within the domain www.elte.hu. Ignore the directions of the links. Plot the CCDF of the node degrees of this network. As a reference, also plot the CCDF of the node degrees of the SF model.
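
In each of the solutions below, the CCDF at degree d is the fraction of nodes whose degree is at least d. A minimal sketch of this definition on a toy degree list (an illustration, not part of any solution):

# CCDF(d): the fraction of nodes with degree >= d
degrees = [1, 1, 2, 3, 3, 5]   # toy degree list
n = len(degrees)
for d in range(max(degrees) + 1):
    print d, sum(1 for k in degrees if k >= d) / float(n)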

Solutions:

1.  FIJ

1.1  Python code to convert measured link list to CCDF (links-to-ccdf.py)

import sys, re

# === function definitions ===

def readLinksFromStdin_writeNodeDegCCDFtoStdout():
    '''Read links from stdin. Write node degree ccdf to stdout.'''

    # ====== node degrees ======

    # the degree of each node
    node2deg = {}

    # read all lines from stdin
    for line in sys.stdin:

        # discard comment and empty lines
        if not line.startswith("#") and not re.search(r'^\s*$', line):

            # take both nodes
            for node in re.findall(r'(\S+)', line):

                # IF we have not yet seen this node, THEN set its degree to zero
                if not node2deg.has_key(node):
                    node2deg[node] = 0

                # increment the degree of this node
                node2deg[node] += 1

    # ====== histogram of node degrees ======

    # histogram of node degrees: hist[d]: number of occurrences of the node degree d
    hist = {}

    # loop through the list of nodes
    for node in node2deg:

        # IF we have not yet seen the degree of this node,
        # THEN set the number of occurrences of this degree to zero
        if not hist.has_key(node2deg[node]):
            hist[node2deg[node]] = 0

        # increment the number of occurrences of this degree by 1
        hist[node2deg[node]] += 1

    # ======= print output ======

    # print output header
    print "# Node degree, 0 replaced with \"-\""
    print "#\tNode degree, 0 included"
    print "#\t\tCCDF"
    print ""

    # the number of nodes:
    numberOfNodes = len(node2deg)

    # ccdf[d]: the probability for a node to have degree higher than d
    # compute the ccdf from the histogram
    # print output data
    ccdfNow = 1.0

    # loop through the keys of the histogram in ascending order
    for nodeDegree in sorted(hist):

        # print one line only while the CCDF is above the numerical error level
        if abs(ccdfNow) > numberOfNodes**(-2.0):

            # print the nonzero node degree or "-" in the first column
            if 0 == nodeDegree:
                sys.stdout.write("-\t")
            else:
                sys.stdout.write("%d\t" % nodeDegree)

            # print the node degree (0 included) and the CCDF
            print "%d\t%.5g" % (nodeDegree, ccdfNow)

        # step to next node degree
        ccdfNow -= hist[nodeDegree]/(1.0*numberOfNodes)

# === main ===

readLinksFromStdin_writeNodeDegCCDFtoStdout()

1.2  Gnuplot command file (real-vs-sf.gnu)

# settings
se term posts lands color enh dash "Helvetica" 22
se log xy
se xlab 'Node degree'
se ylab "CCDF of node degrees"
se grid
se o 'real-vs-sf.ps'
se xtic ("1" 1, "10" 10, "100" 100, "10^{&{.}3}" 1000, "10^{&{.}4}" 1e+4, "10^{&{.}5}" 1e+5)
se ytic ("1" 1, "10^{{/Symbol \055}&{.}2}" 0.01, "10^{{/Symbol \055}&{.}4}" 1.0e-4, "10^{{/Symbol \055}&{.}6}" 1.0e-6)

# plotting
p [1:1e+5][2.5e-7:4] \
\
'elte-net-ccdf.txt' u 1:3 title 'Measured (small)'   w p pt 2 ps 1.5 lt 1 lw 3, \
\
'sf-ccdf.txt'       u 1:3 title 'Scale-Free (large)' w p pt 1 ps 1.5 lt 3 lw 3

1.3  How to use the Python code and the gnuplot command file

The input files are:

  • links.txt -- list of hyperlinks within the domain www.elte.hu
  • sf-ccdf.txt -- CCDF of the node degrees of a large Scale-Free graph
The commands to run are:

python links-to-ccdf.py < links.txt > elte-net-ccdf.txt
gnuplot real-vs-sf.gnu # the output is real-vs-sf.ps
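
The generation of sf-ccdf.txt is not shown here; one possibility (an assumption, not part of this solution) is the class script ccdf-sf.py that solution 6 below also uses:

python ccdf-sf.py 100000 2 > sf-ccdf.txt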

1.4  Output (image)

2.  DG

2.1  Python code of another crawler (web-crawler.py)

import sys
import urllib
import urlparse
import HTMLParser

script, main_url = sys.argv

# Only the domain name and its subdomains matter.
base_url = urlparse.urlparse(main_url).netloc
if base_url.startswith("www."):
    base_url = base_url[4:]

# URLs as nodes. This dictionary maps each URL to the set of URLs it points to.
# node_neighbors = { url1: { url2, url3, ... }, url4: { ... }, ... }
node_neighbors = {}

# URL parser class
class parseUrls(HTMLParser.HTMLParser):
    # Declare self.url_list
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.url_list = set()

    # Called at the start of each tag;
    # we keep only the <a ...> tags.
    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for attr in attrs:
                if attr[0] == "href":
                    self.url_list.add(attr[1])

def getAllUrls(url):
    # Instance of the parser.
    parser = parseUrls()

    # Feed data into the parser.
    handle = urllib.urlopen(url)
    parser.feed(unicode(handle.read(), errors = 'ignore'))
    parser.close()

    return parser.url_list

# Filter urls to include only the urls that are in the base_url domain.
def filterUrls(url_list):
    new_url_list = set()
    for url in url_list:

        # Do not include links to non-HTML files.
        if url.endswith((".jpg", ".png", ".ppt", ".pptx", ".rtf", ".pdf", ".doc", ".docx")):
            continue

        # Do not include https.
        if url.startswith("https://"):
            continue

        if url.startswith("//"):
            url = "http:" + url

        parsed_url = urlparse.urlparse(url)

        #if parsed_url.netloc.endswith(base_url):
        #    # If it is in the subdomain, we add to the new list.
        #    new_url_list.add(url)
        #elif url.startswith("/"):
        #    # If it only has a path, it's a relative path, we include this too.
        #    new_url_list.add(main_url + url)

        # Only include the same full domain.
        #print parsed_url.netloc
        if parsed_url.netloc == base_url or (parsed_url.netloc == "www." + base_url):
            new_url_list.add(url)

    return new_url_list

# Crawls the url and, recursively, the urls found on it.
# If the url is already in node_neighbors, it stops.
def crawl_url(start_url):
    if node_neighbors.has_key(start_url):
        return

    # Mark this url as visited, so that deeper recursion levels do not come back here.
    node_neighbors[start_url] = set()

    print(">> Crawling url '" + start_url + "'...")

    # Get all the urls from start_url,
    # and filter them.
    url_list = getAllUrls(start_url)
    url_list = filterUrls(url_list)

    # Going through all the urls.
    for url in url_list:
        crawl_url(url)

    node_neighbors[start_url] = url_list

    print("<< Done crawling url '" + start_url + "'.")

crawl_url(main_url)
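
The crawler only builds node_neighbors in memory; to feed the converter of section 2.2, the links still have to be written out as tab-separated "<node1> <node2>" lines. A minimal sketch of this missing step (assumed glue code with an assumed output file name), appended after crawl_url(main_url):

# assumed glue code: dump the collected links as tab-separated lines
with open("crawled-links.txt", "w") as outF:
    for url in node_neighbors:
        for neighbor in node_neighbors[url]:
            outF.write(url + "\t" + neighbor + "\n")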

2.2  Python code of converter: Link list → CCDF of node degrees

# Calculates the complementary cumulative distribution function (CCDF)
# of the degree distribution of a network from a file with the format
# <node1> <node2>

import sys

# The total number of nodes is the first command-line argument
script, n_nodes = sys.argv
n_nodes = int(n_nodes)

# Dictionary of the node degrees
# degrees[node] = degree of the node
degrees = {}

# Read in all the links, line by line
for line in sys.stdin:
    # skip comment lines and empty lines
    if line.startswith('#') or not line.strip():
        continue

    linedata = line[:-1].split("\t")
    node1 = linedata[0]
    node2 = linedata[1]

    # If a node is already in the degrees array,
    # its degree is incremented,
    # otherwise its degree is set to one
    if degrees.has_key(node1):
        degrees[node1] += 1
    else:
        degrees[node1] = 1

    # If every link appears twice in the input (once in each direction), we do not need this part.
    #if degrees.has_key(node2):
    #    degrees[node2] += 1
    #else:
    #    degrees[node2] = 1

# Find the maximum degree,
# so we can allocate the array for storing the distribution.
max_degrees = 0
for k in degrees:
    if degrees[k] > max_degrees:
        max_degrees = degrees[k]

# Allocating the array for storing the distribution
distribution = []
for i in range(0, max_degrees + 1):
    distribution.append(0)

# Going through every node,
# incrementing its degree by one in the distribution array
for k in degrees:
    distribution[degrees[k]] += 1.0 / n_nodes

distribution[0] = 1.0 * (n_nodes - len(degrees)) / n_nodes
#print distribution[0]

# Printing out the degree distribution (for debug)
#for i in range(0, len(distribution)):
#    print(str(i) + " " + str(distribution[i]))

# Allocating the array for storing the ccdf
ccdf = []
ccdf_currently = 1
for i in range(0, max_degrees + 1):
    ccdf.append(ccdf_currently)
    ccdf_currently -= distribution[i]

# Printing out the ccdf
for i in range(0, len(ccdf)):
    print(str(i) + " " + str(ccdf[i]))
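
A possible run, assuming the converter is saved as degrees-ccdf.py and the crawl produced crawled-links.txt with about 10000 nodes (the file names and the node count are assumptions):

python web-crawler.py http://www.elte.hu
python degrees-ccdf.py 10000 < crawled-links.txt > dg-ccdf.txt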

2.3  Output (image)

3.  KZS

3.1  Python code

#-*- coding: utf-8 -*-
import sys, re
import random as rnd

#-------------read the url-network from a file-------------------------------------------
def readNetworkFromFile(inFileStream):
  node2neiSet = {}
  nodeDegrees = {}

  for line in inFileStream.readlines():
    if re.search('^\d', line):
      nodesCon = re.findall('[0-9]+', line)
      #print nodesCon[0], nodesCon[1] #we found an edge!

      # update the degrees of the two endpoint nodes:
      for node in nodesCon:
        if node in nodeDegrees:
          nodeDegrees[node] += 1
        else:
          nodeDegrees[node] = 1

        if node not in node2neiSet:
          node2neiSet[node] = set()

      node2neiSet[nodesCon[0]].add(nodesCon[1])
      node2neiSet[nodesCon[1]].add(nodesCon[0])

  return nodeDegrees

#-------------simulating a scale-free network for comparison---------------------
def addLink(nodeIndexList, node2neiSet, node1, node2):
  for node in (node1, node2):
    nodeIndexList.append(node)   

  #if a node has no neighbours, declare its neighbour-set:
  for node in (node1, node2):
    if not node2neiSet.has_key(node): 
      node2neiSet[node] = set()

  #add the nodes to each other's neighbour sets:
  node2neiSet[node1].add(node2)
  node2neiSet[node2].add(node1)


def sf_graf_return_nodeDegree(N, m):
  # we need a list: each node's index appears as many times as its degree:
  nodeIndexList = []

  # stores the network:
  node2neiSet = {}

  # initialization: m nodes forming a clique
  if m < 2:
    sys.exit("The parameter m has to be at least 2")

  for node1 in range(m):
    node2 = node1 + 1  # here I deviated from the prescribed function to avoid self-links!
    while node2 <= (m-1):
      addLink(nodeIndexList, node2neiSet, node1, node2)
      node2 += 1

  # perform growth step t, starting from t=1
  for t in range(N-m):
    # the set of old nodes the new node will be linked to
    # -- a set, so that no node is selected twice
    selOldNodeSet = set()
    while len(selOldNodeSet) < m:
      selOldNodeSet.add(nodeIndexList[rnd.randint(0, len(nodeIndexList)-1)])

    #adding the links of the new node, whose index is m+t
    for i in selOldNodeSet:
      addLink(nodeIndexList, node2neiSet, i, m+t)

  #at the end: get the node degrees:
  nodeDegrees = {}
  for currentNode in node2neiSet.keys():
    nodeDegrees[currentNode] = len(node2neiSet[currentNode])

  return nodeDegrees  #return value: the degrees of all nodes


#-------------getting the ccdf from the node degrees ---------------------------------------
def getCompDist(nodeDegrees): #nodeDegrees has to be a dictionary!

  # the degree can be at most the number of links; its minimum is zero
  complementearyDistFunction = [0] * (max(nodeDegrees.values())+1)

  # loop through all the possible degrees
  for degree in range(max(nodeDegrees.values())+1):
    # for the given degree, count the nodes whose degree is at least this large
    for nodeNmbr in nodeDegrees:
      if(nodeDegrees[nodeNmbr] >= degree):
        complementearyDistFunction[degree] += 1

    complementearyDistFunction[degree] /= float(len(nodeDegrees))

  return complementearyDistFunction

#----------printing the ccdf function------------------------------------------------------------
def printCompDist(nodeDegrees, complementearyDistFunction, outFileStream):
  outFileStream.write("#Degree\tCCDF\n")
  for i in range(max(nodeDegrees.values())+1):
    outFileStream.write("%d\t%f\n" % (i, complementearyDistFunction[i]))

#-------------------main----------------------------------

script, inFileName, outFileName, m = sys.argv
m = int(m)

inFile = open(inFileName, 'r')
outFile = open(outFileName, 'w')

#read url-network from infile:
nodeDegrees_urlNet = readNetworkFromFile(inFile)
print "Max degree in url-net:", max(nodeDegrees_urlNet.values())
#create sf-network with the same number of nodes:
nodeDegrees_sfNet = sf_graf_return_nodeDegree(len(nodeDegrees_urlNet.keys()), m)
print "Max degree in sf-net:", max(nodeDegrees_sfNet.values())

ccdf_urlNet = getCompDist(nodeDegrees_urlNet)
ccdf_sfNet = getCompDist(nodeDegrees_sfNet)

#printCompDist(nodeDegrees, ccdf, outFile)
outFile.write("#Degree\tCCDF_url\tCCDF_sf\n")
for i in range(1, max(max(nodeDegrees_urlNet.values()), max(nodeDegrees_sfNet.values()))+1, 1):
  #if one of the maximum degrees is larger than the other, pad the shorter ccdf with zeros
  if len(ccdf_urlNet) <= i:
    ccdf_urlNet.append(0)
  if len(ccdf_sfNet) <= i:
    ccdf_sfNet.append(0)
  outFile.write("%d\t%f\t%f\n" % (i, ccdf_urlNet[i], ccdf_sfNet[i]))

outFile.close()
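
A possible invocation (the script name kzs-ccdf.py and m = 2 are assumptions): the arguments are the input link list, the output file, and the SF parameter m.

python kzs-ccdf.py links.txt kzs-ccdf.txt 2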

3.2  Output (image)

4.  JM

4.1  Python code: generating the network

import sys,random

# ========= function definitions =========

def addLink(nodeIndxList, node2neiSet, node1, node2):
    '''Insert a link into the two book-keeping data structures'''

    # for both nodes:
    for node in (node1,node2):
        # add the two node indexes to the list of stub (link end point) indexes
        nodeIndxList.append(node)
        # if this node has no neighbors yet, then declare the set of its neighbors
        if not node2neiSet.has_key(node):
            node2neiSet[node] = set()

    # add the 2nd node to the set of neighbors of the 1st node
    node2neiSet[node1].add(node2)
    # same for node 1
    node2neiSet[node2].add(node1)

# --------------------------------------

def sf(n, m):
    '''Generate a Scale-Free graph of n nodes, adding m links per new node'''

    # --- variables ---
    # a list containing each node as many times as its degree
    nodeIndxList = []    
    # for each node the set of its neighbors
    node2neiSet = {}


    # --- testing parameters and initializing ---
    # m >= 2
    if m < 2:
        sys.exit("The parameter m has to be at least 2")
    # connect each pair among the first m nodes
    for node1 in range(m):
        node2 = node1 + 1  # start from node1+1 to avoid self-links
        while node2 <= m-1:
            addLink(nodeIndxList,node2neiSet,node1,node2)        
            node2 += 1

    # --- iteratively: in each step add 1 new node and its m links ---
    for t in range(n-m):
        # select m old nodes to which the new node will be connected
        # select each old node with a probability proportional to its degree
        selOldNodeSet = set()
        # a set grows only when we add a different value, so
        # we will leave the while loop only after randomly selecting m different values
        while m > len(selOldNodeSet):
            # select one node index from the list of node indexes
            selOldNodeSet.add(nodeIndxList[random.randint(0, len(nodeIndxList)-1)])

#        # printing the current node degree values vs time
#        if t+1 > 100:
#            print t+1, len(node2neiSet[0]), len(node2neiSet[98])
#        else:
#            print t+1, len(node2neiSet[0])

        # connect the new node to each of the selected old nodes
        for i in selOldNodeSet:
            # the index of the new node is m+t
            addLink(nodeIndxList, node2neiSet, i, m+t)
            # print the new link, so stdout can be redirected to a link list file
            print i, m+t

    # --- return value: the network ---
    return node2neiSet

# ========= main =========

# generate the network

n = 100000
m = 2

node2neiSet = sf(n,m)
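
With the print statement inside sf() emitting each new link (see above), the link list read in section 4.2 can be produced by redirecting standard output (the script name sf-gen.py is an assumption):

python sf-gen.py > sflinks.txt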

4.2  Python code: converting degree list to the CCDF of the degrees

import re

# a function to create the CCDF of the degrees from the per-node degree array
# (degreeDistribution[node] holds the degree of the node with that index)
def getCumulativeDegreeDistribution(degreeDistribution, CCDF, largestNode):
    for i in range(len(degreeDistribution)):
        # CCDF[i]: the fraction of nodes whose degree is at least i
        CCDF[i] = 1 - sum(1 for j in degreeDistribution if j < i) / float(largestNode+1)

# get the node list: both endpoints of every link contribute to the degrees
nodeList = []
for line in open("sflinks.txt"):
    columns = line.split()
    if re.match(r"\d", columns[0]):
        nodeList.append(int(columns[0]))
        nodeList.append(int(columns[1]))

# get the per-node degrees: degreeDistribution[node] = degree of the node
largestNode = int(max(nodeList))
print largestNode  # debug: the largest node index
degreeDistribution = [0] * (largestNode + 1)
for node in nodeList:
    degreeDistribution[node]+=1

# get the CCDF
CCDF = [0] * (largestNode + 1) 
getCumulativeDegreeDistribution(degreeDistribution, CCDF, largestNode)  

for i in range(len(CCDF)):
    # print the degree and the CCDF value
    print i, CCDF[i]
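
The script reads sflinks.txt from the current directory and prints one "degree CCDF" pair per line, so the data file for plotting can be produced with, for example (the script name is an assumption):

python sf-links-to-ccdf.py > sf-ccdf-jm.txt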

4.3  Output (image)

5.  BGS

5.1  Python code

import sys, re

# === function definitions ===

def readLinksFromFile_writeNodeDegCCDFtoFile(inF):
    nodeg = {}

    # all lines
    for line in inF.readlines():
        # keep only the lines containing data
        if not line.startswith("#") and not re.search(r'^\s*$', line):
            # the two endpoints of the link
            for node in re.findall(r'(\S+)', line):
                # if we have already seen this node, increment its degree
                if node in nodeg:
                    nodeg[node] += 1
                else:
                    # first occurrence: its degree is 1
                    nodeg[node] = 1

    return nodeg

def comdistfunc(nodeg):
    # find the maximum degree
    maximum = 0
    for i in nodeg:
        if nodeg[i] > maximum:
            maximum = nodeg[i]

    # the degrees range between 0 and the number of links
    ccdf = [0]*(maximum+1)
    for degree in range(maximum+1):
        for node in nodeg:
            # count the nodes whose degree is at least this large --> CCDF
            if nodeg[node] >= degree:
                ccdf[degree] += 1
        ccdf[degree] /= float(len(nodeg))   # normalization
    return ccdf


# === main ===
inF = open('sflinks.txt','r')    # input file
ouF = open('link_elte.txt','w')  # file to write
ccdf = comdistfunc(readLinksFromFile_writeNodeDegCCDFtoFile(inF))
for j in range(len(ccdf)):
    ouF.write("%d\t%f\n" % (j, ccdf[j]))

5.2  Output (image)

6.  SZT

6.1  Python script (hw5.py)

# Python programming and networks - Homework 5
# Szorenyi Tamas
# From today's (2016.03.22.) class notes download links.txt, which is the list of hyperlinks going within the domain www.elte.hu.
# Ignore the directions of the links. Plot the ccdf of the node degrees of this network.
# As a reference, plot also the ccdf of the node degrees of the SF model.

# This file only contains the link list --> CCDF conversion
# The CCDF of the SF model can be found in ccdf-sf.py, which was taken from 2016.03.03's class notes.

from sys import argv

script, InputFile = argv

# Read link list into dictionary
Links_Dict = {}
Nodes = 0 # Total number of nodes

with open(InputFile, 'r') as f:
	for line in f:
		if not line.startswith('#') and not line == '\n':
			splitLine = line.split()
			TempKey = int(splitLine[0])
			TempValue = int(splitLine[1])
			if not Links_Dict.has_key(TempKey):
				Links_Dict[TempKey] = set()
				Nodes += 1
			Links_Dict[TempKey].add(TempValue)
			#print splitLine[0] # Test
#print Links_Dict # Test
#print Nodes # Test

# Create CCDF dictionary (histogram)
CCDF_Dict = {}
for i in Links_Dict.keys():
	NodeNeighbourCount = len(Links_Dict[i])
	if not CCDF_Dict.has_key(NodeNeighbourCount):
		CCDF_Dict[NodeNeighbourCount] = 0
	CCDF_Dict[NodeNeighbourCount] += 1
#print CCDF_Dict # Test

# Printing results
print "# Node degree"
print "#	CCDF of node degrees"
CurrentCCDF = 1.0
for j in sorted(CCDF_Dict):
	if abs(CurrentCCDF) > Nodes**(-2.0):
		print "%d\t%.5f" % (j, CurrentCCDF)
	# step to the next node degree
	CurrentCCDF -= CCDF_Dict[j]/(1.0*Nodes)

6.2  Shell script

The script ccdf-sf.py comes from our 2016-03-03 class.

#!/bin/bash

python hw5.py 2016-03-21-www_elte_hu-links.txt > hw5measured.txt
python ccdf-sf.py 100000 2 > hw5sf.txt

gnuplot -persist << EOPLOT

set terminal png
set xlabel "Node degree"
set ylabel "CCDF of node degrees"
set logscale xy
set xrange [0.5:1500]
set yrange [0.000005:1.5]
set grid
set key box
set out "hw5plot.png"
plot "hw5measured.txt" w p lw 2 t "Measured", "hw5sf.txt" u 2:3 w p lw 2 t "Scale-free"

EOPLOT

6.3  Output (image)