Skip to main content

Create SQLite DB from Wikipedia

Create an SQLite DB based on the British Monarch Family Tree.
https://en.wikipedia.org/w/index.php?title=Family_tree_of_British_monarchs&oldid=1043575587 

## set up tables: british_monarch_family_tree
import sqlite3
conn = sqlite3.connect('british_monarch_family_tree.db')

cur = conn.cursor()

cur.execute('DROP TABLE IF EXISTS british_monarch_family_tree ')
cur.execute('CREATE TABLE british_monarch_family_tree ('\
            'id INTEGER PRIMARY KEY AUTOINCREMENT, '\
            'name TEXT, '\
            'wiki_url TEXT)')

import requests
from bs4 import BeautifulSoup

# Fetch the page content from the url
user_agent = {'User-agent': 'Mozilla/5.0'}
url = 'https://en.wikipedia.org/w/index.php?title=Family_tree_of_British_monarchs&oldid=1043575587'
page = requests.get(url, headers = user_agent)
soup = BeautifulSoup(page.content)

conn = sqlite3.connect('british_monarch_family_tree.db')
cur = conn.cursor()

for tr in soup.find('table').find_all('td'):
    link = tr.find('a')
    if link and link.text != '' and 'House' not in link.text and 'House' not in link.attrs['href'] and 'History' not in link.attrs['href']:
        cur.execute('INSERT INTO british_monarch_family_tree (name, wiki_url) VALUES (?, ?)',
                    (link.get_text(separator=" ").strip(), link.attrs['href']))
conn.commit()
conn.close()

Adding foreign key, Mother and Father.

# Alter table to add columns with foreign key
conn = sqlite3.connect('british_monarch_family_tree.db')
cur = conn.cursor()
cur.execute('ALTER TABLE british_monarch_family_tree '\
            'ADD COLUMN father_id INTEGER DEFAULT NULL '\
            'REFERENCES british_monarch_family_tree(id)')
cur.execute('ALTER TABLE british_monarch_family_tree '\
            'ADD COLUMN mother_id INTEGER DEFAULT NULL '\
            'REFERENCES british_monarch_family_tree(id)')
conn.close()

# Fetch all from table
user_agent = {'User-agent': 'Mozilla/5.0'}

conn = sqlite3.connect('british_monarch_family_tree.db')
cur = conn.cursor()
cur.execute('SELECT COUNT(*) from british_monarch_family_tree')
table_size = cur.fetchone()[0]

#for c in range(1, table_size+1):
c = 1
while(c <= table_size):
    cur.execute("SELECT id, wiki_url from british_monarch_family_tree WHERE id = ?", (c,))
    r = cur.fetchone()
    
    # Navigate to each wiki_url https://en.wikipedia.org+wiki_url
    if r[1] == '':
        pass
    
    url = 'https://en.wikipedia.org' + r[1]
    page = requests.get(url, headers = user_agent)
    soup = BeautifulSoup(page.content)
    
    # find the father mother
    th_content = soup.find_all(class_="infobox-label")
    td_content = soup.find_all(class_="infobox-data")
    for i in range(0,len(th_content)):
        if th_content[i].text == 'Father' or th_content[i].text == 'Mother':
            link_tag = td_content[i].find('a')
            if link_tag is None:
                link = ''
            else:
                link = link_tag.attrs['href']
            name = td_content[i].get_text(separator=" ").strip()
            # Check db for exisiting record of parent and get id
            cur.execute("SELECT id from british_monarch_family_tree WHERE wiki_url LIKE ? OR name LIKE ?", 
                        (link,name))
            # Add to db if missing
            if cur.fetchone() is None: 
                cur.execute('INSERT INTO british_monarch_family_tree (name, wiki_url) VALUES (?, ?)',
                            (name, link))
                conn.commit()
                
                # Question did not state to find the parents of the parents
                # If true,row count will be 1000 == DatabaseError: database disk image is malformed
                # table_size += 1 
                
            # Get the father id
            cur.execute("SELECT id from british_monarch_family_tree WHERE wiki_url LIKE ? OR name LIKE ?", 
                        (link,name))
            
            # Update record with parents
            parent_id = cur.fetchone()
            if th_content[i].text == 'Father':
                cur.execute("UPDATE british_monarch_family_tree SET father_id = ? where id=?", (parent_id[0],r[0]))
            elif th_content[i].text == 'Mother':
                cur.execute("UPDATE british_monarch_family_tree SET mother_id = ? where id=?", (parent_id[0],r[0]))
    c+=1
conn.commit()
conn.close()

 Find all children of King "George III":

conn = sqlite3.connect('british_monarch_family_tree.db')
cur = conn.cursor()
cur.execute('SELECT * from british_monarch_family_tree b1 '\
            'WHERE father_id = (SELECT id from british_monarch_family_tree '\
            'WHERE name = "George III")')
result = cur.fetchall()
conn.close()
for r in result:
    print (r)

Output:

(85, 'George IV', '/wiki/George_IV_of_the_United_Kingdom', 84, 176)
(86, 'William IV', '/wiki/William_IV_of_the_United_Kingdom', 84, 176)
(87, 'Edward Duke of Kent and Strathearn', '/wiki/Prince_Edward,_Duke_of_Kent_and_Strathearn', 84, 176)

Find the father and mother of King "George III":

conn = sqlite3.connect('british_monarch_family_tree.db')
cur = conn.cursor()
cur.execute('Select * from british_monarch_family_tree '\
            'WHERE id = (SELECT father_id from british_monarch_family_tree WHERE name = "George III") '\
            'OR id = (SELECT mother_id from british_monarch_family_tree WHERE name = "George III")')
result = cur.fetchall()
conn.close()
for r in result:
    print (r)

Output:

(83, 'Frederick Prince of Wales', '/wiki/Frederick,_Prince_of_Wales', 82, 174)
(175, 'Princess Augusta of Saxe-Gotha', '/wiki/Princess_Augusta_of_Saxe-Gotha', None, None)

Find all descendants of "Queen Victoria":

# Load db table into df
conn = sqlite3.connect('british_monarch_family_tree.db')
cur = conn.cursor()
cur.execute('SELECT * from british_monarch_family_tree')
result = cur.fetchall()
conn.close()

df_db = pd.DataFrame(result,
                     columns=['id', 'name', 'wiki_url', 'father_id', 'mother_id'])
df_db = df_db.set_index('id')

# find queen victoria id
root_id = df_db.loc[df_db['name'] == 'Victoria'].index[0]

# make a temp list of descendants to hold index
descendants = []

# insert victoria id into the list index 0
descendants.append(root_id)

# while loop to keep looping till no more descendant
# for loop to fetch new descendants
index = 0
while(index < len(descendants)):
    for id in df_db.loc[(df_db['father_id'] == descendants[index]) | 
                        (df_db['mother_id'] == descendants[index])].index:
        if id not in descendants:
            descendants.append(id)
    index+=1

# print df and skip first index 0 victoria
df_db.iloc[[i-1 for i in descendants[1::]]]

Output in Dataframe:

idnamewiki_urlfather_idmother_id
89Edward VII/wiki/Edward_VII178.088.0
90George V/wiki/George_V89.0179.0
91Edward VIII/wiki/Edward_VIII90.0180.0
92George VI/wiki/George_VI90.0180.0
93Elizabeth II/wiki/Elizabeth_II92.0181.0