Edinburgh Bike Open Data – 1 of 4 – data acquisition

As a keen cyclist I thought I would take a look at Edinburgh Council’s Bike Counter dataset. The website states that “The dataset includes bike counts collected on a hourly-basis between 2007 and 2016, from 48 off-road and on-road counters installed in Edinburgh” which seems like an interesting period of time to examine for possible changes in usage patterns.

The first challenge is to acquire the data: each counter’s data is stored in a separate CSV file on a separate page. Let’s use Beautiful Soup to retrieve it. First, let’s import some libraries


# import libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns # NB needs Seaborn 0.9.0
import lxml
from bs4 import BeautifulSoup as soup
import sys
import os
import urllib.request

# enable inline plotting
%matplotlib inline

# set some directory path variables
directory = r"C:\Users\Justin\Documents\Python Scripts\Blog"
CSV_directory = r"C:\Users\Justin\Documents\Python Scripts\Blog\CSVs"

# set the working directory
os.chdir(directory)
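
If the CSVs folder doesn’t already exist, the os.chdir and to_csv calls later on will fail, so it’s worth creating it up front:

# make sure the local CSV folder exists before we try to write into it
os.makedirs(CSV_directory, exist_ok=True)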

Now let’s grab the main page for the dataset so that we can parse it


# grab the lead page of the survey from:
# survey: https://data.edinburghopendata.info/dataset/bike-counter-data-set-cluster
# license: http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/

main_URL = r"https://data.edinburghopendata.info/dataset/bike-counter-data-set-cluster"
main_URL = r"https://data.edinburghopendata.info/dataset/bike-counter-data-set-cluster/resource/c1014c68-3fdf-4fb7-8318-7f0d6b31286d"
URL_list = []

# grab the parent page
request = urllib.request.Request(main_URL)
opened = urllib.request.urlopen(request)
main_HTML = opened.read()
opened.close()

# Convert HTML to soup for parsing
soup_main = soup(main_HTML, "html.parser")
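
As a quick sanity check that the request worked, we can print the page title from our soup object; it should mention the bike counter dataset:

# quick check that we fetched the right page
print(soup_main.title.get_text())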

Examining the source code of the main page, we can see that the links to the pages we need are held in the ‘href’ attributes of ‘a’ tags, which we can easily retrieve from our soup object. After retrieving them we can match them against the pattern we observed for a valid target URL and store the results


# Now let's get a list of the pages containing the CSVs to download
link_list = []
for link in soup_main.find_all('a'):
    # extract the relative link
    link_text = link.get('href')
    # check our returns are valid strings
    if isinstance(link_text, str):
        # and that they are the kind of link we need
        if (link_text[:48] == "/dataset/bike-counter-data-set-cluster/resource/"): 
            link_list += ['https://data.edinburghopendata.info' + link_text]
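
As an aside, Beautiful Soup can do this filtering for us: find_all accepts a compiled regular expression as an attribute filter, which gives a more compact version of the same loop (link_list_alt is just an illustrative name for this sketch):

import re

# equivalent, more compact way to collect the resource links
pattern = re.compile(r"^/dataset/bike-counter-data-set-cluster/resource/")
link_list_alt = ['https://data.edinburghopendata.info' + a.get('href')
                 for a in soup_main.find_all('a', href=pattern)]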

Unfortunately these forward links lead to pages containing the CSVs rather than the CSVs themselves, so we need to repeat the process, this time extracting the locations of the actual CSV files

# now let's cycle over these addresses and get the URLs for the data
CSV_page_list = []

# cycle over our previously obtained links
for link in link_list:
    request = urllib.request.Request(link)
    opened = urllib.request.urlopen(request)
    CSV_page = opened.read()
    opened.close()
    # Convert HTML to soup for parsing
    soup_page = soup(CSV_page, "html.parser")
    # extract a list of all the 'a' tags
    tag_list = soup_page.find_all('a')
    # cycle over the tag list for the current page
    for tag in tag_list:
        link_class = tag.get('class')
        # class attributes are handed back as lists, filter out any oddities
        if isinstance(link_class, list):
            # find the link we want
            if link_class == ["resource-url-analytics"]:
                # and add it to our page list
                CSV_page_list += [tag.get('href')]

# remove doubles
CSV_page_list = list(set(CSV_page_list))
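
Before downloading anything it’s worth a quick check of how many distinct CSV links we ended up with; the dataset description mentions 48 counters, so the count should be in that region:

# how many distinct CSV files did we find?
print(len(CSV_page_list))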

Now we can download the CSVs. It’s advisable to save them locally to reduce the number of requests made of the online datastore and to speed up reloading the dataset. After downloading a local copy, let’s create a dictionary to hold them: a dictionary makes it easier to keep track of our different dataframes by using the Bike Counter IDs as the dictionary keys

 
# now let's retrieve these CSVs and save them locally
os.chdir(CSV_directory)
for CSV_link in CSV_page_list:
    # read the CSV from the web, skipping any malformed rows
    df_temp = pd.read_csv(CSV_link, error_bad_lines=False)
    # save a copy of the CSV locally; the slice drops the fixed-length URL prefix,
    # leaving just the filename
    df_temp.to_csv(CSV_link[136:], index=False)

# let's load our data from disc into a dictionary
os.chdir(CSV_directory)

df_dict = {}
# cycle through the directory looking for CSVs
for file in os.listdir():
    if file.endswith(".csv"):
        # load the CSV as a dataframe and add it to the dictionary
        data = pd.read_csv(file, error_bad_lines=False)
        # get the key (the counter ID) from the first two digits of the filename
        CSV_number = int(file[0:2])
        df_dict[CSV_number] = data
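
With everything loaded, a quick peek confirms the dictionary is keyed by counter ID and holds a dataframe per counter (grabbing an arbitrary key here rather than assuming a particular ID exists):

# list the counter IDs we loaded and preview one of the dataframes
print(sorted(df_dict.keys()))
first_key = next(iter(df_dict))
print(df_dict[first_key].head())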

Now that our data is loaded, the next step will be to clean it.
