#! python3
# downloadXkcd.py - Downloads every single XKCD comic.
"""Download xkcd comic images by walking the site backwards.

Starting from the xkcd front page, each page's comic image is saved into
``<current dir>/Web-Scraping-Projects/xkcd_comics`` and the page's "Prev"
link is followed. The crawl stops when the "Prev" link's href ends with
``#``, when no "Prev" link is present, or when a page cannot be downloaded.
"""
# Importing all the necessary libraries
import os
from urllib.parse import urljoin

import bs4
import requests

url = 'http://xkcd.com'  # starting url
directory = 'xkcd_comics'
# Built with os.path.join so the separator is correct on every platform
# (a hard-coded backslash only works as a separator on Windows).
parent_dir = os.path.join(os.curdir, 'Web-Scraping-Projects')
path = os.path.join(parent_dir, directory)

# Seconds to wait on a request before giving up, so a stalled connection
# cannot hang the crawl indefinitely.
REQUEST_TIMEOUT = 30


def _fetch(target):
    """Return the response for a GET of *target*.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            timeouts, malformed URLs, or a non-success HTTP status.
    """
    response = requests.get(target, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _save_comic(soup, page_url):
    """Save the comic image referenced by the parsed page *soup* into ``path``.

    A page without a usable ``#comic img`` element, or an image that fails
    to download, is reported and skipped so the crawl can continue.
    """
    # All <img> elements inside the element whose id is "comic".
    comic_elem = soup.select('#comic img')
    src = comic_elem[0].get('src') if comic_elem else None
    if not src:
        print('Could not find the comic image.')
        return

    # urljoin resolves protocol-relative ("//host/x.png"), relative and
    # absolute src values against the page URL.
    comic_url = urljoin(page_url, src)
    file_name = os.path.basename(comic_url)
    if not file_name:
        # No file name to save under (URL ends with a slash).
        print('Could not find the comic image.')
        return

    print(f"Downloading the image {comic_url} ")
    try:
        res = _fetch(comic_url)
    except requests.exceptions.RequestException as err:
        # Skip this comic
        print(f'There was a problem: {err} ')
        return

    # Save the image to ./xkcd_comics; "with" closes the file even if a
    # write fails part-way through.
    with open(os.path.join(path, file_name), 'wb') as image_file:
        for chunk in res.iter_content(100000):
            image_file.write(chunk)


def _previous_page_url(soup, page_url):
    """Return the URL behind the page's "Prev" link, or None to stop.

    None is returned when the page has no ``a[rel="prev"]`` element, when
    that element has no href, or when the href ends with ``#``.
    """
    prev_links = soup.select('a[rel="prev"]')
    if not prev_links:
        return None
    href = prev_links[0].get('href')
    if not href or href.endswith('#'):
        return None
    return urljoin(page_url, href)


def main():
    """Create the output directory, then crawl and download the comics."""
    try:
        os.makedirs(path, exist_ok=True)  # store comics in ./xkcd_comics
        print(f"Directory {directory} created successfully.")
    except OSError as error:
        print(f"Directory {error} can not be created.")
        # Nothing can be saved without the directory, so stop here.
        raise SystemExit(1) from error

    page_url = url
    while page_url is not None:
        # Download the page.
        try:
            res = _fetch(page_url)
        except requests.exceptions.RequestException as err:
            # Stop on a bad page download: without the page there is no
            # "Prev" link to follow.
            print(f'There was a problem: {err} ')
            break

        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed.
        xkcd_soup = bs4.BeautifulSoup(res.text, 'html.parser')

        _save_comic(xkcd_soup, page_url)

        # Get the Prev button's url.
        page_url = _previous_page_url(xkcd_soup, page_url)

    print('Done.')


if __name__ == '__main__':
    main()