-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathlist.py
More file actions
34 lines (29 loc) · 1.17 KB
/
Copy pathlist.py
File metadata and controls
34 lines (29 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""
Data Extractor
This script extracts the data held in an HTML table on a web page,
given the URL of the page and a CSS selector for the table
(either the class name or the id of the table).
"""
import urllib
import urllib.request

from bs4 import BeautifulSoup
import pandas as pd
# pylint: disable=C0103
# Page to scrape; the first element matching "table.wikitable" is extracted.
url = 'http://en.wikipedia.org/wiki/List_of_Presidents_of_the_United_States'

# Only rows at index 0..MAX_ROW_INDEX (among the selected rows) are considered.
# NOTE(review): presumably this cuts off trailing rows that are not
# presidents -- confirm against the current layout of the page.
MAX_ROW_INDEX = 80

# Positions of the <td> cells that feed each output column.
TERM_CELL = 1   # "<start> – <end>", split into the "Start" and "End" columns
NAME_CELL = 3   # source of the "President" column
PARTY_CELL = 6  # source of the "Party" column


def _row_to_record(cells):
    """Build one output record from the <td> texts of a single table row.

    Args:
        cells: list of strings, one per <td> cell of the row.

    Returns:
        A dict with the keys "President", "Start", "End" and "Party", or
        None when the row has too few cells to contain all of those fields
        (for example a row made only of header or spanning cells).
    """
    # PARTY_CELL is the highest index read below, so it decides whether the
    # row is usable; indexing a shorter row would raise IndexError.
    if len(cells) <= PARTY_CELL:
        return None

    # Flatten line breaks and drop everything from the first "(" onwards
    # before splitting the term into its two ends.
    term = cells[TERM_CELL].replace("\n", " ").split("(")[0]
    bounds = term.split(" – ")

    # Keep at most the first three space-separated words of the name cell.
    name_words = cells[NAME_CELL].replace("\n", " ").split(" ")[:3]

    return {
        "President": " ".join(name_words),
        "Start": bounds[0],
        # A term cell without the " – " separator has no end part; record it
        # as empty rather than failing.
        "End": bounds[1] if len(bounds) > 1 else "",
        "Party": cells[PARTY_CELL].replace("\n", ""),
    }


request = urllib.request.Request(url)
opener = urllib.request.build_opener()
# The response is only needed while the document is parsed, so close it
# right afterwards instead of leaving the connection open.
with opener.open(request) as response:
    soup = BeautifulSoup(response, "lxml")

table = soup.select_one("table.wikitable")
if table is None:
    # select_one() found nothing; fail with a message that names the cause.
    raise RuntimeError("no element matching 'table.wikitable' found at " + url)

# Text of every <td> cell, grouped per row; "tr + tr" selects only rows that
# directly follow another row.
body = [[td.text for td in row.find_all("td")] for row in table.select("tr + tr")]

pres = []
for cells in body[:MAX_ROW_INDEX + 1]:
    record = _row_to_record(cells)
    if record is not None:
        pres.append(record)

dataset = pd.DataFrame(pres)
dataset.to_excel("presidents.xlsx")