"Wikipedia is a free encyclopedia, written collaboratively by the people who use it. It is a special type of website designed to make collaboration easy, called a wiki. Many people are constantly improving Wikipedia, making thousands of changes per hour. All of these changes are recorded in article histories and recent changes."

The collaborative nature of Wikipedia creates an open-source environment that allows users to track edits as they are made in real time. The edits are streamed on the recent changes page:

https://en.wikipedia.org/wiki/Special:RecentChanges

Using the live updates from the page above, I will demonstrate how to crawl Wikipedia using Python and display the topic of each entry/edit. In this post, I will create a word cloud (tag cloud) that displays words in a random configuration on a figure. The word clouds will include color, rotation angle, and size variations, but they will not include shape boundaries, as is often done (this adds another level of difficulty that will not be covered here). Word clouds are used as visual marketing tools for disseminating information in a visually pleasant way and can be used to add another layer of visualization to an existing product/website/presentation.

Creating a Random Word Cloud

import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# function for live-plotting
def live_plotter(ax,words,x_loc,y_loc,color,size_input,rotation):
    """Create the word-cloud figure, or add one word to an existing one.

    When ``ax`` equals an empty list the call only sets things up:
    interactive mode is switched on, an 11x6 inch figure with a dark
    background is opened, a frameless subplot with both axes hidden is
    added and shown, and ``(axes, [])`` is returned.  The word arguments
    are ignored in that case.

    Otherwise ``words`` is annotated on ``ax`` at ``(x_loc, y_loc)``
    (figure-pixel coordinates) with the given ``color``, font size
    ``size_input`` and ``rotation``, and ``(ax, annotation)`` is returned.
    """
    needs_setup = (ax == [])

    if not needs_setup:
        placed_word = ax.annotate(
            words,
            xy=(x_loc, y_loc),
            xycoords='figure pixels',
            fontsize=size_input,
            color=color,
            rotation=rotation,
        )
        # short pause so the figure is refreshed between words
        plt.pause(0.01)
        return ax, placed_word

    # first call: build the empty canvas
    plt.ion()
    canvas = plt.figure(figsize=(11, 6), facecolor='#3b3b3b')
    ax = canvas.add_subplot(111, frameon=False)
    for hidden_axis in (ax.axes.get_xaxis(), ax.axes.get_yaxis()):
        hidden_axis.set_visible(False)
    plt.show()
    return ax, []

# initialize the figure so that we can loop and add words
ax = []
font_pixel_density = 20  # font size, also used as a rough per-character width in pixels
ax,annot1 = live_plotter(ax,' ',0.0,0.0,'k',font_pixel_density,0)
fig = ax.get_figure()
fig_size = fig.get_dpi()*fig.get_size_inches()  # (width, height) of the figure in pixels
# choose colormap - plt.get_cmap is used because matplotlib.cm.get_cmap
# was removed in Matplotlib 3.9
cmap = plt.get_cmap('tab20c')

# random word vector - insert words between the quotation marks below
lorem_ipsum = " insert words here"

# split on any run of whitespace so that leading/trailing spaces and line
# breaks do not produce empty "words"
words_for_cloud = (lorem_ipsum.strip(',')).split()

# fraction of the figure the words may occupy; the rest is left as a margin
zoom_ratio = 0.95

# loop through words
for annot_text in words_for_cloud:
    # estimate the width of the word in pixels
    annot_x_size = len(annot_text)*font_pixel_density
    # randomize the location of the word; the free space is clamped at zero so
    # that a word wider than the figure is not pushed to a negative coordinate
    free_x = max(fig_size[0]-annot_x_size, 0.0)
    free_y = max(fig_size[1]-font_pixel_density, 0.0)
    x_loc = random.uniform(1.0-zoom_ratio, zoom_ratio)*free_x
    y_loc = random.uniform(1.0-zoom_ratio, zoom_ratio)*free_y

    # randomize the color, rotation angle, and size of the word text
    color = matplotlib.colors.to_hex(cmap(random.random()))
    rotation = random.uniform(-1, 1)*30
    size_var = random.uniform(0.4,1)*font_pixel_density

    # snapshot the annotations that exist before the new word is drawn; only
    # Annotation artists are compared, so the axes' own artists (spines, axis
    # objects, background patch) are never tested or removed
    prev_annotations = [child for child in ax.get_children()
                        if isinstance(child, matplotlib.text.Annotation)]
    ax,annot1 = live_plotter(ax,annot_text,x_loc,y_loc,color,size_var,rotation)

    # extents are (x0, y0, x1, y1) in display pixels
    new_box = annot1.get_window_extent().extents
    for old_annot in prev_annotations:
        old_box = old_annot.get_window_extent().extents
        # if the new word's bounding box touches an older one, remove the old text
        if new_box[2]-old_box[0]>=0 and old_box[2]-new_box[0]>=0 and \
           new_box[3]-old_box[1]>=0 and old_box[3]-new_box[1]>=0:
            old_annot.remove()

The word cloud above utilizes rotation, size variation, and color mapping all using Python and a simple matplotlib annotation. There are many possibilities from here involving word clouds, and one such example using Wikipedia will be covered in the next section.

UPDATE: Newer and nicer code using spiral format below

import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# function for live-plotting
def live_plotter(ax,words,x_loc,y_loc,color,size_input,rotation):
    """Create the word-cloud figure, or add one word to an existing one.

    When ``ax`` equals an empty list the call only sets things up:
    interactive mode is switched on, an 11x6 inch figure with a dark
    background is opened, a frameless subplot with both axes hidden is
    added and shown, and ``(axes, [])`` is returned.  The word arguments
    are ignored in that case.

    Otherwise ``words`` is annotated on ``ax`` at ``(x_loc, y_loc)``
    (figure-pixel coordinates) with the given ``color``, font size
    ``size_input`` and ``rotation``, and ``(ax, annotation)`` is returned.
    """
    needs_setup = (ax == [])

    if not needs_setup:
        placed_word = ax.annotate(
            words,
            xy=(x_loc, y_loc),
            xycoords='figure pixels',
            fontsize=size_input,
            color=color,
            rotation=rotation,
        )
        # short pause so the figure is refreshed between words
        plt.pause(0.01)
        return ax, placed_word

    # first call: build the empty canvas
    plt.ion()
    canvas = plt.figure(figsize=(11, 6), facecolor='#3b3b3b')
    ax = canvas.add_subplot(111, frameon=False)
    for hidden_axis in (ax.axes.get_xaxis(), ax.axes.get_yaxis()):
        hidden_axis.set_visible(False)
    plt.show()
    return ax, []

# initialize the figure so that we can loop and add words
ax = []
font_pixel_density = 20  # largest font size used for a word
ax,annot1 = live_plotter(ax,' ',0.0,0.0,'k',font_pixel_density,0)
fig = ax.get_figure()
fig_size = fig.get_dpi()*fig.get_size_inches()  # (width, height) of the figure in pixels
# choose colormap - plt.get_cmap is used because matplotlib.cm.get_cmap
# was removed in Matplotlib 3.9
cmap = plt.get_cmap('tab20c')

# random word vector
lorem_ipsum = """Lorem ipsum dolor sit amet, consectetur adipiscing elit"""

# split on any run of whitespace so that a multi-line paragraph pasted into
# the triple-quoted string above is still broken into single words
words_for_cloud = (lorem_ipsum.strip('.')).split()
word_count = len(words_for_cloud)

# minimum reach (as a fraction of the half-figure) around the centre
spiral_speed = 0.1

# loop through words
for ii, annot_text in enumerate(words_for_cloud):
    # later words may land further from the centre: the reach grows linearly
    # from spiral_speed up to (just under) 0.8 + spiral_speed
    reach = (ii/word_count)*0.8 + spiral_speed
    x_loc = (fig_size[0]/2)+random.uniform(-reach, reach)*(fig_size[0]/2)
    y_loc = (fig_size[1]/2)+random.uniform(-reach, reach)*(fig_size[1]/2)

    # randomize the color, rotation angle, and size of the word text
    color = matplotlib.colors.to_hex(cmap(random.random()))
    # mostly horizontal words, occasionally turned by +/-90 degrees
    # (a free tilt would be random.uniform(-1, 1)*30)
    rotation = random.choice([-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1])*90
    size_var = (random.uniform(0.4,1)*font_pixel_density)

    # snapshot the annotations that exist before the new word is drawn
    prev_annotations = [child for child in ax.get_children()
                        if isinstance(child, matplotlib.text.Annotation)]
    ax,annot1 = live_plotter(ax,annot_text,x_loc,y_loc,color,size_var,rotation)

    # extents are (x0, y0, x1, y1) in display pixels
    new_box = annot1.get_window_extent().extents

    # if the word sticks out past the right or top edge, shift it back inside
    # by exactly the overflow, then re-measure before testing for overlaps
    overflow_x = max(new_box[2]-fig_size[0], 0.0)
    overflow_y = max(new_box[3]-fig_size[1], 0.0)
    if overflow_x > 0 or overflow_y > 0:
        annot1.set_position((x_loc-overflow_x, y_loc-overflow_y))
        new_box = annot1.get_window_extent().extents

    # here is the loop for checking whether the new annotation interferes with
    # others - if it does we drop the new text and keep the older words
    for old_annot in prev_annotations:
        old_box = old_annot.get_window_extent().extents
        if old_box[2]-new_box[0]>0 and new_box[2]-old_box[0]>0 and \
           old_box[3]-new_box[1]>0 and new_box[3]-old_box[1]>0:
            annot1.remove()
            break

plt.savefig('word_cloud_spiral.png',facecolor=fig.get_facecolor(), edgecolor='none',dpi=95)

Word cloud created using the spiral method

Word Cloud Visualization of Wikipedia Edits

Now that we have the word cloud generator working in conjunction with the live-plotter, we can use the Wikipedia recent changes page to test the function in real time. The URL that we need to crawl in order to view recent changes is:

https://en.wikipedia.org/wiki/Special:RecentChanges?hidebots=0&hidecategorization=1&hideWikibase=1&hidelog=1&limit=50&days=1&urlversion=1

In order to crawl the above page, I use the following Python method that gets webpage information via the requests library:

import requests

def html_crawl(key1,content):
    """Return the attribute text that follows each occurrence of ``key1``.

    For every place ``key1`` appears in ``content`` (overlapping matches
    included), the text between the end of ``key1`` and the next ``">``
    is collected, provided that terminator starts within 200 characters.
    Occurrences without a terminator in that window are skipped.

    HTML character references in the collected text (``&amp;``,
    ``&quot;`` ...) are decoded, so ``Tom &amp; Jerry`` is returned as
    ``Tom & Jerry``.

    Parameters:
        key1: marker string that immediately precedes the wanted text.
        content: the HTML source to scan.

    Returns:
        list of str, in order of appearance.
    """
    import html  # local import keeps this listing self-contained

    end_key = '">'
    max_title_len = 200  # the terminator must start within this many characters

    wiki_titles = []
    match_at = content.find(key1)
    while match_at != -1:
        title_start = match_at+len(key1)
        # str.find only reports matches lying entirely inside [start, end),
        # so this end bound allows the terminator to start at offsets 0..199
        title_end = content.find(end_key, title_start,
                                 title_start+max_title_len-1+len(end_key))
        if title_end != -1:
            wiki_titles.append(html.unescape(content[title_start:title_end]))
        # advance by one character so overlapping markers are still found
        match_at = content.find(key1, match_at+1)

    return wiki_titles

def wiki_crawl(timeout=10):
    """Fetch Wikipedia's recent-changes page and return the edited page titles.

    Parameters:
        timeout: seconds to wait for the server before giving up.  Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze a live plot that calls this in a
            loop.

    Returns:
        list of str: the text found after each "diff" link marker in the
        page, as extracted by ``html_crawl``.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    page = requests.get('https://en.wikipedia.org/wiki/Special:RecentChanges?hidebots=0&hidecategorization=1&hideWikibase=1&hidelog=1&limit=50&days=1&urlversion=1',
                        timeout=timeout)
    content = page.text
    # marker that immediately precedes the title text in the page source
    key1 = 'class="mw-changeslist-diff" title="'

    wiki_edits = html_crawl(key1,content)
    return wiki_edits

if __name__=="__main__":
    # when run as a script, crawl the recent-changes page once;
    # wiki_edits holds the list returned by wiki_crawl()
    wiki_edits = wiki_crawl()

The above code will crawl Wikipedia's recent changes page for edits made by users and return the title of each page edited.

If everything looks good, we now want to take those titles and create a dynamic word cloud. We can do so by combining the Wikipedia page crawl and the word cloud above and use a while-loop to incorporate Wiki edit titles on the recent changes page. The word cloud result should look similar to the .gif below:

The full code used to create the dynamic word cloud can be found below (click to expand):

import requests,random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

def html_crawl(key1,content):
    """Return the attribute text that follows each occurrence of ``key1``.

    For every place ``key1`` appears in ``content`` (overlapping matches
    included), the text between the end of ``key1`` and the next ``">``
    is collected, provided that terminator starts within 200 characters.
    Occurrences without a terminator in that window are skipped.

    HTML character references in the collected text (``&amp;``,
    ``&quot;`` ...) are decoded, so ``Tom &amp; Jerry`` is returned as
    ``Tom & Jerry``.

    Parameters:
        key1: marker string that immediately precedes the wanted text.
        content: the HTML source to scan.

    Returns:
        list of str, in order of appearance.
    """
    import html  # local import keeps this listing self-contained

    end_key = '">'
    max_title_len = 200  # the terminator must start within this many characters

    wiki_titles = []
    match_at = content.find(key1)
    while match_at != -1:
        title_start = match_at+len(key1)
        # str.find only reports matches lying entirely inside [start, end),
        # so this end bound allows the terminator to start at offsets 0..199
        title_end = content.find(end_key, title_start,
                                 title_start+max_title_len-1+len(end_key))
        if title_end != -1:
            wiki_titles.append(html.unescape(content[title_start:title_end]))
        # advance by one character so overlapping markers are still found
        match_at = content.find(key1, match_at+1)

    return wiki_titles

def wiki_crawl(timeout=10):
    """Fetch Wikipedia's recent-changes page and return the edited page titles.

    Parameters:
        timeout: seconds to wait for the server before giving up.  Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze a live plot that calls this in a
            loop.

    Returns:
        list of str: the text found after each "diff" link marker in the
        page, as extracted by ``html_crawl``.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    page = requests.get('https://en.wikipedia.org/wiki/Special:RecentChanges?hidebots=0&hidecategorization=1&hideWikibase=1&hidelog=1&limit=50&days=1&urlversion=1',
                        timeout=timeout)
    content = page.text
    # marker that immediately precedes the title text in the page source
    key1 = 'class="mw-changeslist-diff" title="'

    wiki_edits = html_crawl(key1,content)
    return wiki_edits


def live_plotter(ax,words,x_loc,y_loc,color,size_input,rotation):
    """Create the word-cloud figure, or add one word to an existing one.

    When ``ax`` equals an empty list the call only sets things up:
    interactive mode is switched on, an 11x6 inch figure is opened, a
    frameless subplot with both axes hidden is added and shown, and
    ``(axes, [])`` is returned.  The word arguments are ignored in that
    case.

    Otherwise ``words`` is annotated on ``ax`` at ``(x_loc, y_loc)``
    (figure-pixel coordinates) with the given ``color``, font size
    ``size_input`` and ``rotation``, and ``(ax, annotation)`` is returned.
    """
    needs_setup = (ax == [])

    if not needs_setup:
        placed_word = ax.annotate(
            words,
            xy=(x_loc, y_loc),
            xycoords='figure pixels',
            fontsize=size_input,
            color=color,
            rotation=rotation,
        )
        # short pause so the figure is refreshed between words
        plt.pause(0.01)
        return ax, placed_word

    # first call: build the empty canvas
    plt.ion()
    canvas = plt.figure(figsize=(11, 6))
    ax = canvas.add_subplot(111, frameon=False)
    for hidden_axis in (ax.axes.get_xaxis(), ax.axes.get_yaxis()):
        hidden_axis.set_visible(False)
    plt.show()
    return ax, []

# initialize the figure so that we can loop and add titles
ax = []
font_pixel_density = 17  # largest font size, also used as a rough per-character width in pixels
ax,annot1 = live_plotter(ax,' ',0.0,0.0,'k',font_pixel_density,0.0)
fig = ax.get_figure()
fig_size = fig.get_dpi()*fig.get_size_inches()  # (width, height) of the figure in pixels
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
cmap = plt.get_cmap('tab10')

# fraction of the figure the titles may occupy; the rest is left as a margin
zoom_ratio = 0.95

while True:
    # fetch a fresh batch of recently edited titles on every pass
    wiki_vals = wiki_crawl()
    if not wiki_vals:
        # nothing came back: wait a moment (keeping the window responsive)
        # instead of re-requesting the page in a tight loop
        plt.pause(1.0)
        continue

    for annot_text in wiki_vals:
        # estimate the width of the title in pixels
        annot_x_size = len(annot_text)*font_pixel_density
        # randomize the location of the title; the free space is clamped at
        # zero so that a title wider than the figure is not pushed to a
        # negative coordinate
        free_x = max(fig_size[0]-annot_x_size, 0.0)
        free_y = max(fig_size[1]-font_pixel_density, 0.0)
        x_loc = random.uniform(1.0-zoom_ratio, zoom_ratio)*free_x
        y_loc = random.uniform(1.0-zoom_ratio, zoom_ratio)*free_y

        # randomize the color and size of the text; titles are kept horizontal
        # (a free tilt would be random.uniform(-1, 1)*30)
        color = matplotlib.colors.to_hex(cmap(random.random()))
        rotation = 0.0
        size_var = random.uniform(0.4,1)*font_pixel_density

        # snapshot the annotations that exist before the new title is drawn;
        # only Annotation artists are compared, so the axes' own artists
        # (spines, axis objects, background patch) are never tested or removed
        prev_annotations = [child for child in ax.get_children()
                            if isinstance(child, matplotlib.text.Annotation)]
        ax,annot1 = live_plotter(ax,annot_text,x_loc,y_loc,color,size_var,rotation)

        # extents are (x0, y0, x1, y1) in display pixels
        new_box = annot1.get_window_extent().extents
        for old_annot in prev_annotations:
            old_box = old_annot.get_window_extent().extents
            # if the new title's bounding box touches an older one, remove the old text
            if new_box[2]-old_box[0]>=0 and old_box[2]-new_box[0]>=0 and \
               new_box[3]-old_box[1]>=0 and old_box[3]-new_box[1]>=0:
                old_annot.remove()

Conclusion

In this tutorial I demonstrated how to create a live word cloud using Python. As an example, I first scattered a Lorem Ipsum paragraph around a matplotlib figure to create a simple word cloud with words that vary in size, color, and rotation angle. Then, I used a real-world example that showed the user how to crawl Wikipedia's recent changes page and plot each Wiki title to a dynamic word cloud, casting each recent change onto a figure full of multi-colored words. Using these examples and the code above, the user is capable of creating their own word cloud using any series of words or titles.

See More in Live Plotting and Python:

Featured