summaryrefslogtreecommitdiff
path: root/src/downloader/__init__.py
blob: 15fba41855703decce55d662718e9bb81f5b58da (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Init file of the downloader module.

The downloader module handles the downloading part of the program,
including building request URLs for the MediaWiki API.
"""

import urllib.request
import urllib.parse


class Downloader():
    """Class used to download a given webpage considering system proxy"""
    def __init__(self):
        """
        Downloader class constructor

        Builds an opener from the proxies reported by
        urllib.request.getproxies() and installs it as the default opener
        used by urllib.request.urlopen() for the whole process.
        """
        self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
        self.opener = urllib.request.build_opener(self.proxy)
        urllib.request.install_opener(self.opener)

    def download(self, url):
        """
        Download the given URL and return the source code

        The response body is read entirely and decoded as UTF-8. Errors
        raised by urllib (e.g. urllib.error.URLError) and
        UnicodeDecodeError for non UTF-8 content are propagated to the
        caller.
        """
        # The context manager closes the response (and its underlying
        # connection) even when reading or decoding fails.
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf8")

    def download_in_file(self, url, output_file_path):
        """
        Download the given URL and write to the given file

        The file is written as UTF-8 and replaces any existing file at
        output_file_path.
        """
        # Download first so that a failed request does not truncate an
        # already existing output file.
        content = self.download(url)
        # Explicit encoding: the content was decoded from UTF-8 and may hold
        # characters that the platform default encoding cannot represent.
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(content)


class WikimediaAPI():
    """
    Class used to generate MediaWiki API urls for several uses
    """

    # Namespace name -> value of the "rcnamespace" API parameter.
    # Subject namespaces have even ids and their talk namespace is the
    # following odd id (see https://meta.wikimedia.org/wiki/Help:Namespace).
    _RCNAMESPACES = {
        "(Main)": "0",
        "Talk": "1",
        "User": "2",
        "User talk": "3",
        "Wikipedia": "4",
        "Wikipedia talk": "5",
        "File": "6",
        "File talk": "7",
        "MediaWiki": "8",
        "MediaWiki talk": "9",
        "Template": "10",
        "Template talk": "11",
        "Help": "12",
        "Help talk": "13",
        "Category": "14",
        "Category talk": "15",
        # Custom Wikipedia namespaces
        "Portal": "100",
        "Portal talk": "101",
        "Book": "108",
        "Book talk": "109",
        "Draft": "118",
        "Draft talk": "119",
        "Education Program": "446",
        "Education Program talk": "447",
        "TimedText": "710",
        "TimedText talk": "711",
        "Module": "828",
        "Module talk": "829",
        "Topic": "2600",
    }

    def __init__(self, endpoint="http://en.wikipedia.org/w/api.php",
                 return_format="json"):
        """
        WikimediaAPI class constructor

        The endpoint for this project should be
        "http://en.wikipedia.org/w/api.php" but it can be any other wiki
        api endpoint made with the MediaWiki software.

        The return_format can be one of json, php, wddx, xml, yaml, raw, txt,
        dbg, dump or none.
        """
        self.endpoint = endpoint
        self.return_format = return_format

    def get_recent_changes(self, namespace="(Main)"):
        """
        Get the url corresponding to the latest changes made to the wiki.
        (https://www.mediawiki.org/wiki/API:Recentchanges)

        The namespace is used to restrict the results to a certain level. It
        can be (Main) which is the default one, "Wikipedia", "File" or
        others. It is converted to the numeric id (as a string) expected by
        the rcnamespace parameter.
        See https://meta.wikimedia.org/wiki/Help:Namespace

        Returns the full request url as a string. Any query string already
        present in the endpoint is replaced by the generated one.

        Raises KeyError if the namespace name is not known.
        """
        url_params = {
            "action": "query",
            "list": "recentchanges",
            "format": self.return_format,
            "rcnamespace": self._RCNAMESPACES[namespace],
        }
        url_params_str = urllib.parse.urlencode(url_params)
        # Joining with a query-only reference keeps the endpoint path and
        # replaces its query part.
        return urllib.parse.urljoin(self.endpoint, "?" + url_params_str)