
Python and Networks -- Class notes -- 2016-03-22

1.  Spider to map the network of www.elte.hu pages

1.1  Python code (spider.py)

import re, sys, urllib, urllib2, time

# ------- read parameters -------

# URL to start from, maximum number of pages (nodes), outFiles: nodes, links
script, firstUrl, nUrlMax, outFileNodes, outFileLinks = sys.argv
nUrlMax = int(nUrlMax)
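
# illustrative example (added note, matching section 1.2): the command
#   python spider.py http://www.elte.hu 1000 nodes.txt links.txt
# gives firstUrl = "http://www.elte.hu", nUrlMax = 1000,
# outFileNodes = "nodes.txt" and outFileLinks = "links.txt"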

# ------- declare variables -------

id2url = [] # ID->URL of downloaded pages
url2id = {} # URL->ID of downloaded pages
urlsToDownload = [] # list of URLs to download
nodeId2outNeiIdSet = {} # for each node (webpage) the set of its out-neighbors

# ------- how to add one node (url) to the node book-keeping ---------

def getNodeId_savePrintIfNew(url, id2url, url2id, urlsToDownload, oFileStream):
    '''Get ID of a node (url)
       If the node is new, then add+print it to the book-keeping'''

    # if we have not yet seen this URL, then add it to the book-keeping
    if url not in url2id:

        # ID of the new node: the next available ID
        # set ID->URL mapping for the new URL
        id2url.append(url)

        # set URL->ID mapping for the new URL
        url2id[url] = len(id2url)-1

        # if the URL is new, THEN save that it should be downloaded
        urlsToDownload.append(url)

        # print the ID -> URL mapping of the new node
        oFileStream.write("%d\t%s\n" % (url2id[url],url))

    # return the ID of the node (URL)
    return url2id[url]
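
# illustrative example (added note): starting from an empty book-keeping, the call
#   getNodeId_savePrintIfNew("http://www.elte.hu", id2url, url2id, urlsToDownload, oNodes)
# appends the URL to id2url, sets url2id["http://www.elte.hu"] = 0, queues the URL
# for download, writes "0<TAB>http://www.elte.hu" to the node file and returns 0;
# a second call with the same URL only returns the already assigned ID 0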

# -------- get the hyperlinks on a webpage --------

def url2outNeiUrlSet_only_www_elte(url,nDownloaded):
    '''Download webpage, return the set of html out-links to www.elte.hu
       and the updated download counter'''

    # return this set
    outNeiUrlSet = set()

    # website of the URL
    siteOfUrl = re.findall(r'(http://[^\/\"]+)', url).pop(0)    

    # try to read the contents of the URL into "htmlPage"
    htmlPage = None
    try:
        htmlPage = urllib2.urlopen(url).read()
    # handle error (urllib2.urlopen raises urllib2.URLError on failure)
    except urllib2.URLError, e:
        print "Error, cannot read \"%s\"" % url

    # continue only if we could read the html page 
    if htmlPage:

        # change counter: downloaded another page
        nDownloaded += 1

        # extract the body of the html page (empty string if there is no <body> tag)
        bodyList = re.findall(r'\<body.*?\>(.+?)\<\/body\>', htmlPage, flags=re.IGNORECASE|re.DOTALL)
        htmlPageBody = bodyList[0] if bodyList else ""

        # analyze the body of the HTML page
        # the \/?\" removes the trailing / character
        # loop through the list of URLs to which the current page points
        for targetUrl in re.findall(r'href=\"(.+?)\/?\"', htmlPageBody, flags=re.IGNORECASE):

            # filter the linked URLs (target URLs)
            if( 
                # discard files (URLs ending in a dot and letters)
                not re.search(r"\.[a-zA-Z]+$", targetUrl) and
                # do not allow email links
                not re.search(r"^mailto", targetUrl) and
                # do not allow URLs containing parameters
                not re.search(r"(\?|\&)", targetUrl) and
                # allow only www.elte.hu URLs
                    re.search(r"www\.elte\.hu", targetUrl) ):

                # if the target URL starts with a //, THEN prepend http: to it
                if( re.search(r'^\/\/', targetUrl ) ):
                    targetUrl = "http:" + targetUrl

                # else if the target URL starts with /, THEN prepend site of source URL
                elif( re.search(r'^\/', targetUrl ) ):
                    targetUrl = siteOfUrl + targetUrl

                # remove bookmarks
                targetUrl = re.sub(r'\#.*?$', r'', targetUrl)

                # remove index.(html|htm|cgi|php) from the end of the URL
                targetUrl = re.sub(r'\/index\.[a-z]+?$', r'', targetUrl) 

                # save target URL
                outNeiUrlSet.add(targetUrl)

    # return the set of out-neighbor URLs and the updated download counter
    return outNeiUrlSet, nDownloaded
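
# illustrative examples (added note) of the link filtering and normalization
# done above, assuming the links are found on a www.elte.hu page:
#   href="//www.elte.hu/en/"                 -> http://www.elte.hu/en
#   href="http://www.elte.hu/hirek#friss"    -> http://www.elte.hu/hirek
#   href="http://www.elte.hu/index.html#top" -> http://www.elte.hu
#   discarded: mailto: links, URLs with parameters (? or &), URLs ending in a
#   file extension (e.g. .pdf) and URLs outside www.elte.hu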

# ------- the spider for mapping a portion of the WWW ----------

def spider(id2url, url2id, urlsToDownload, nodeId2outNeiIdSet, nUrlMax, outFileNodes, outFileLinks):
    '''Map a portion of the WWW with the bubble method'''

    # -------- open outfiles, print their headers -----------

    # nodes
    oNodes = open(outFileNodes, 'w')
    oNodes.write("# ID->URL of webpage network\n# ID\n#\tURL\n\n")

    # edges
    oLinks = open(outFileLinks, 'w')
    oLinks.write("# ID->ID of webpage network\n# Source node ID\n#\tTarget node ID\n\n")

    # ------- initialize variables ---------

    # insert the first URL into the book-keeping
    idNew = getNodeId_savePrintIfNew(firstUrl, id2url, url2id, urlsToDownload, oNodes)

    # count how many pages we have downloaded so far
    nDownloaded = 0

    # ----- loop: while below max page number, download next webpage ------

    # check also if we still have URLs to download
    while len(id2url) < nUrlMax and 0 < len(urlsToDownload):

        # next URL to download
        urlNow = urlsToDownload.pop(0)

        # the set of the current node's out-neighbors (only from www.elte.hu)
        # and the updated number of downloaded pages
        outNeiUrlSet, nDownloaded = url2outNeiUrlSet_only_www_elte(urlNow, nDownloaded)

        # wait for 1 sec after every 5th successfully downloaded page
        if 0 < nDownloaded and 0 == divmod(nDownloaded, 5)[1]:
            time.sleep(1)

        # IF there are out-neighbors, THEN:
        if 0 < len(outNeiUrlSet):

            # declare the set of the current node's out-neighbors
            nodeId2outNeiIdSet[url2id[urlNow]] = set()

            # save out-neighbor nodes to the book-keeping
            for outNeiUrl in outNeiUrlSet:

                # only if we have not yet reached the requested number of nodes
                if len(id2url) < nUrlMax:

                    # get the ID of the out-neighbor
                    # IF it is new, THEN save and print it
                    getNodeId_savePrintIfNew(outNeiUrl, id2url, url2id, urlsToDownload, oNodes)

                    # save out-neighbors of the current node
                    nodeId2outNeiIdSet[url2id[urlNow]].add(url2id[outNeiUrl])

            # print the out-neighbors of the current node
            for outNeiId in sorted(nodeId2outNeiIdSet[url2id[urlNow]],key=int):
                oLinks.write("%d\t%d\n" % (url2id[urlNow],outNeiId))

    # close the output files
    oNodes.close()
    oLinks.close()

# ------- main ---------

spider(id2url, url2id, urlsToDownload, nodeId2outNeiIdSet, nUrlMax, outFileNodes, outFileLinks)

1.2  How to use the code

python spider.py http://www.elte.hu 1000 nodes.txt links.txt >o 2>e&
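
The four command-line arguments are read by spider.py in this order: the starting URL (http://www.elte.hu), the maximum number of nodes (1000), the name of the node output file (nodes.txt) and the name of the link output file (links.txt). The trailing ">o 2>e&" redirects the standard output to the file "o" and the error messages to the file "e", and the final "&" runs the spider in the background.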

1.3  Output files