Skip to content

Commit cca592b

Browse files
committed
Adding Xkcdcomic Scraper
1 parent 2f18733 commit cca592b

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

Web-Scraping-Projects/downloadXkcd.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#! python3
"""downloadXkcd.py - Downloads every single XKCD comic.

Starting at ``http://xkcd.com``, the script saves the image found inside the
page's ``#comic`` element into ``Web-Scraping-Projects/xkcd_comics`` (relative
to the current working directory), then follows the page's ``rel="prev"`` link.
It stops once the next URL ends with ``#``, when a page cannot be downloaded,
or when no previous link can be found.
"""
# Importing all the necessary libraries
import os
from urllib.parse import urljoin

import bs4
import requests

url = 'http://xkcd.com'  # starting url
directory = 'xkcd_comics'
# Build the path with os.path.join rather than a literal ".\..." string: the
# backslash form contains an invalid escape sequence and only works on Windows.
parent_dir = os.path.join(os.curdir, 'Web-Scraping-Projects')
path = os.path.join(parent_dir, directory)
# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30

try:
    os.makedirs(path, exist_ok=True)  # store comics in ./xkcd_comics
    print(f"Directory {directory} created successfully.")
except OSError as error:
    # Nothing can be saved without the output directory, so stop here instead
    # of failing later on the first attempt to write an image.
    raise SystemExit(f"Directory {path} can not be created: {error}")

while not url.endswith('#'):
    # Download the page. Connection errors and bad HTTP statuses are both
    # reported, and the loop stops so an error page is never parsed.
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f'There was a problem: {err}')
        break

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    xkcdsoup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the comic image: the first <img> inside the element
    # whose id is "comic".
    comicElem = xkcdsoup.select('#comic img')
    src = comicElem[0].get('src') if comicElem else None
    if not src:
        print('Could not find the comic image.')
    else:
        # urljoin resolves protocol-relative ("//host/...") and page-relative
        # sources against the current page URL.
        comicUrl = urljoin(url, src)
        print(f"Downloading the image {comicUrl}")
        try:
            image_res = requests.get(comicUrl, timeout=REQUEST_TIMEOUT)
            image_res.raise_for_status()
        except requests.exceptions.RequestException as err:
            # Skip this comic but keep walking back through the archive.
            print(f'Skipping {comicUrl}: {err}')
        else:
            # Save the image to ./xkcd_comics; the with-block closes the
            # file even if a write fails part-way.
            image_path = os.path.join(path, os.path.basename(comicUrl))
            with open(image_path, 'wb') as imageFile:
                for chunk in image_res.iter_content(100000):
                    imageFile.write(chunk)

    # Get the Prev button's url. Plain concatenation is kept on purpose: the
    # loop condition relies on a trailing '#' surviving in the URL.
    prevLinks = xkcdsoup.select('a[rel="prev"]')
    prevHref = prevLinks[0].get('href') if prevLinks else None
    if not prevHref:
        print('Could not find the previous comic link.')
        break
    url = 'http://xkcd.com' + prevHref

print('Done.')

0 commit comments

Comments
 (0)