"""
Init file of the downloader module.

The downloader module is used to take care of the downloading part of the
program, including manipulation of the wikimedia API.
"""

import urllib.request
import urllib.parse
import json


class Downloader():
    """Class used to download a given webpage considering system proxy"""

    # Requests asking for more than this many items are split into chunks
    # of at most this size by download().
    MAX_LIMIT_PER_REQUEST = 500

    def __init__(self):
        """
        Downloader class constructor

        Builds an opener from the proxies reported by
        urllib.request.getproxies() and installs it globally, so every later
        urllib.request.urlopen() call goes through it.
        """
        self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
        self.opener = urllib.request.build_opener(self.proxy)
        urllib.request.install_opener(self.opener)

        # Maps the API data type to the prefix of its "<prefix>limit" and
        # "<prefix>continue" GET parameters.
        self.data_types = {
            "revisions": "rv",
            "recentchanges": "rc",
            "contributors": "pc",
            "geosearch": "gs",
        }

    def download(self, endpoint, data_type, params=None, limit=0):
        """
        Download the given endpoint with GET parameters and return the
        decoded JSON answers, with "query-continue" support.

        endpoint  -- base URL of the API
        data_type -- one of the keys of self.data_types
        params    -- dict of GET parameters; it is copied, never modified
        limit     -- number of items wanted. If limit equals 0, the limit is
                     read from params["<prefix>limit"] (KeyError if missing)

        Returns a list with one decoded JSON document per request made, or
        None when data_type is not a known data type.
        """
        if data_type not in self.data_types:
            return None

        prefix = self.data_types[data_type]
        limit_name = prefix + "limit"
        continue_name = prefix + "continue"

        # Work on a private copy: the limit and continue keys written below
        # must not leak into the caller's dict or into later calls.
        params = dict(params) if params else {}

        if limit == 0:
            limit = params[limit_name]
        else:
            params[limit_name] = limit

        if limit <= self.MAX_LIMIT_PER_REQUEST:
            return [self._fetch_json(endpoint, params)]

        results = []
        remaining = limit
        while remaining > 0:
            # Support for numbers like 1542: full chunks, then the remainder
            chunk = min(remaining, self.MAX_LIMIT_PER_REQUEST)
            remaining -= chunk
            params[limit_name] = chunk

            page = self._fetch_json(endpoint, params)
            results.append(page)

            if "query-continue" not in page:
                # Nothing left on the server side, stop early
                break
            params[continue_name] = (
                page["query-continue"][data_type][continue_name]
            )
        return results

    def download_in_file(self, url, output_file_path):
        """
        Download the given URL and write its content, decoded as UTF-8 text,
        to the given file.

        The download is done before the file is opened, so a failed download
        does not create or truncate the output file.
        """
        with urllib.request.urlopen(url) as response:
            content = response.read().decode("utf8")
        with open(output_file_path, "w", encoding="utf8") as output_file:
            output_file.write(content)

    def compile_url(self, endpoint, params=None):
        """
        Return the endpoint with params url-encoded as its query string.

        params may be omitted (or None) for an empty query string.
        """
        url_params_str = urllib.parse.urlencode(params or {})
        return urllib.parse.urljoin(endpoint, "?" + url_params_str)

    def _fetch_json(self, endpoint, params):
        """Do one GET request and return the decoded JSON document."""
        url = self.compile_url(endpoint, params)
        # The context manager closes the connection once the body is read
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read().decode("utf8"))


class WikimediaAPI():
    """
    Class used to generate wikimedia API requests for several uses.

    Every get_* method returns a tuple (endpoint, url_params) which can be
    given to Downloader.download() or Downloader.compile_url().
    """

    def __init__(self, endpoint="http://en.wikipedia.org/w/api.php",
                 return_format="json"):
        """
        WikimediaAPI class constructor

        The endpoint for this project should be
        "http://en.wikipedia.org/w/api.php" but it can be any other wiki api
        endpoint made with the Wikimedia software.
        The return_format can be one of json, php, wddx, xml, yaml, raw, txt,
        dbg, dump or none.
        """
        self.endpoint = endpoint
        self.return_format = return_format

        # Namespace name -> namespace id, as a string ready to be used as a
        # GET parameter value.
        self.namespaces = {
            "(Main)": "0",
            "Talk": "1",
            "User": "2",
            "User talk": "3",
            "Wikipedia": "4",
            "Wikipedia talk": "5",
            "File": "6",
            "File talk": "7",
            "MediaWiki": "8",
            "MediaWiki talk": "9",
            "Template": "10",
            "Template talk": "11",
            "Help": "12",
            "Help talk": "13",
            "Category": "14",
            "Category talk": "15",
            # Custom Wikipedia namespaces
            "Portal": "100",
            "Portal talk": "101",
            "Book": "108",
            "Book talk": "109",
            "Draft": "118",
            "Draft talk": "119",
            "Education Program": "446",
            "Education Program talk": "447",
            "TimedText": "710",
            "TimedText talk": "711",
            "Module": "828",
            "Module talk": "829",
            "Topic": "2600",
        }

    def get_recent_changes(self, namespace="(Main)"):
        """
        Get the request corresponding to the latest changes made to the wiki.
        (https://www.mediawiki.org/wiki/API:Recentchanges)

        The namespace is used to restrict the results to a certain level. It
        can be (Main) which is the default one, "Wikipedia", "File" or
        others. It is converted to the id sent as the rcnamespace parameter;
        a name missing from self.namespaces raises KeyError.
        See https://meta.wikimedia.org/wiki/Help:Namespace
        """
        url_params = {
            "action": "query",
            "list": "recentchanges",
            "format": self.return_format,
            "rcnamespace": self.namespaces[namespace],
        }
        return self.endpoint, url_params

    def get_contributors(self, page="Main_Page", namespace="(Main)"):
        """
        Get the request corresponding to the contributors of a given page or
        list of pages.
        (https://www.mediawiki.org/wiki/API:Properties#contributors_.2F_pc)

        Use the 'page' parameter to specify the Wikipedia page(s).
        The 'namespace' parameter is accepted but is currently not sent with
        the request.
        """
        url_params = {
            "action": "query",
            "prop": "contributors",
            "format": self.return_format,
            "titles": page,
        }
        return self.endpoint, url_params

    def get_watchers(self, page="Main_Page"):
        """
        Get the request corresponding to the watchers information of a given
        page.
        (https://www.mediawiki.org/wiki/API:Properties#info_.2F_in)

        Use the 'page' parameter to specify the Wikipedia page(s).
        """
        url_params = {
            "action": "query",
            "prop": "info",
            "format": self.return_format,
            "inprop": "watchers",
            "titles": page,
        }
        return self.endpoint, url_params

    def get_revisions(self, page="Main_Page"):
        """
        Get the request corresponding to the list of revisions of a given
        page. Only the revision ids are asked for (rvprop=ids).
        (https://www.mediawiki.org/wiki/API:Properties#revisions_.2F_rv)

        Use the 'page' parameter to specify the Wikipedia page(s).
        """
        url_params = {
            "action": "query",
            "prop": "revisions",
            "format": self.return_format,
            "rvprop": "ids",
            "titles": page,
        }
        return self.endpoint, url_params

    def get_pages_around(self, location="48.8567|2.3508", radius=10000,
                         namespace="(Main)"):
        """
        Get the request corresponding to the pages around coordinates, which
        by default are the coordinates of Paris.
        (https://www.mediawiki.org/wiki/Extension:GeoData#list.3Dgeosearch)

        Use the 'location' parameter to specify the coordinates, written as
        "latitude|longitude", and 'radius' for the gsradius parameter.
        The 'namespace' parameter is accepted but is currently not sent with
        the request.
        """
        url_params = {
            "action": "query",
            "list": "geosearch",
            "format": self.return_format,
            "gscoord": location,
            "gsradius": radius,
        }
        return self.endpoint, url_params