summaryrefslogtreecommitdiff
path: root/src/downloader/__init__.py
blob: 831ea064f7561c30625987ebcacd222b56339c9a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
"""
Init file of the downloader module.

The downloader module is used to take care of the downloading part of the
program, including manipulation of the wikimedia API.
"""

import urllib.request
import urllib.parse
import json


class Downloader():
    """
    Download web resources, honouring the system proxy configuration.

    Besides plain downloads, the class knows how to page through MediaWiki
    API query modules that expose "<prefix>limit" / "<prefix>continue"
    parameters (see the data_types attribute).
    """

    # Largest item count requested in a single API call; larger limits are
    # split into several requests of at most this size.
    MAX_ITEMS_PER_REQUEST = 500

    def __init__(self):
        """
        Downloader class constructor.

        Builds an opener from the proxies reported by
        urllib.request.getproxies() and installs it globally, so later
        urllib.request.urlopen() calls go through it.
        """
        self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
        self.opener = urllib.request.build_opener(self.proxy)
        urllib.request.install_opener(self.opener)
        # API module name -> prefix of its "limit" and "continue" parameters
        self.data_types = {
            "revisions": "rv",
            "recentchanges": "rc",
            "contributors": "pc",
            "geosearch": "gs",
        }

    def download(self, endpoint, data_type, params=None, limit=0):
        """
        Query the endpoint with GET parameters and return the decoded JSON
        responses as a list (one element per HTTP request).

        endpoint  -- base URL of the API
        data_type -- API module name; when it is a key of self.data_types the
                     matching "<prefix>limit" parameter is managed here and
                     continuation is followed, otherwise a single request is
                     made with the parameters untouched
        params    -- dict of GET parameters; it is copied, never modified
        limit     -- total number of items wanted. If it equals 0, the limit
                     is read from the "<prefix>limit" entry of params

        Limits above MAX_ITEMS_PER_REQUEST are fetched in several requests.
        Paging stops early when a response carries no continuation data.

        Raises KeyError when limit is 0 and params has no "<prefix>limit"
        entry, and ValueError when the limit is not convertible to int.
        """
        # Work on a copy: the limit/continue bookkeeping below must not leak
        # into the caller's dict (nor into a shared default argument).
        params = dict(params) if params else {}

        prefix = self.data_types.get(data_type)
        if prefix is None:
            # No known limit/continue parameter names: one plain request
            return [self._fetch_json(self.compile_url(endpoint, params))]

        limit_name = prefix + "limit"
        if limit == 0:
            limit = params[limit_name]
        # The limit may come from params as a string such as "50"
        remaining = int(limit)

        results = []
        while True:
            batch_size = min(remaining, self.MAX_ITEMS_PER_REQUEST)
            params[limit_name] = batch_size
            page = self._fetch_json(self.compile_url(endpoint, params))
            results.append(page)
            remaining -= batch_size
            if remaining <= 0:
                break
            if not self._apply_continuation(params, page, data_type):
                # The server has nothing more to give
                break
        return results

    @staticmethod
    def _apply_continuation(params, page, data_type):
        """
        Copy the continuation values of a response into params.

        Return True when continuation data was found, False otherwise.
        """
        # Current format: a top-level "continue" object to merge as-is
        continuation = page.get("continue")
        if not continuation:
            # Legacy format: "query-continue" keyed by the module name
            continuation = page.get("query-continue", {}).get(data_type)
        if not continuation:
            return False
        params.update(continuation)
        return True

    def _fetch_json(self, url):
        """ Download the given URL and return its body parsed as JSON """
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read().decode("utf8"))

    def download_in_file(self, url, output_file_path):
        """
        Download the given URL and write its body, decoded as UTF-8 text, to
        the given file.
        """
        # Fetch first so that a failed download does not truncate the file
        with urllib.request.urlopen(url) as response:
            content = response.read().decode("utf8")
        with open(output_file_path, "w", encoding="utf8") as output_file:
            output_file.write(content)

    def compile_url(self, endpoint, params=None):
        """
        Return the endpoint URL with params encoded as its query string.

        Any query string already present in endpoint is replaced when params
        is not empty.
        """
        url_params_str = urllib.parse.urlencode(params or {})
        return urllib.parse.urljoin(endpoint, "?" + url_params_str)


class WikimediaAPI():
    """
    Class used to generate wikimedia API requests for several uses.

    Every get_* method returns a (endpoint, url_params) tuple: the API base
    URL and a dict of GET parameters describing the query.
    """
    def __init__(self, endpoint="http://en.wikipedia.org/w/api.php",
                 return_format="json"):
        """
        WikimediaAPI class constructor

        The endpoint for this project should be
        "http://en.wikipedia.org/w/api.php" but it can be any other wiki
        api endpoint made with the Wikimedia software.

        The return_format can be one of json, php, wddx, xml, yaml, raw, txt,
        dbg, dump or none.
        """
        self.endpoint = endpoint
        self.return_format = return_format
        # Namespace name -> namespace number (as a string, ready to be used
        # as a GET parameter value)
        self.namespaces = {
            "(Main)": "0",
            "Talk": "1",
            "User": "2",
            "User talk": "3",
            "Wikipedia": "4",
            "Wikipedia talk": "5",
            "File": "6",
            "File talk": "7",
            "MediaWiki": "8",
            "MediaWiki talk": "9",
            "Template": "10",
            "Template talk": "11",
            "Help": "12",
            "Help talk": "13",
            "Category": "14",
            "Category talk": "15",
            # Custom Wikipedia namespaces
            "Portal": "100",
            "Portal talk": "101",
            "Book": "108",
            "Book talk": "109",
            "Draft": "118",
            "Draft talk": "119",
            "Education Program": "446",
            "Education Program talk": "447",
            "TimedText": "710",
            "TimedText talk": "711",
            "Module": "828",
            "Module talk": "829",
            "Topic": "2600"
        }

    def get_recent_changes(self, namespace="(Main)"):
        """
        Get the endpoint and parameters corresponding to the latest changes
        made to the wiki.
        (https://www.mediawiki.org/wiki/API:Recentchanges)

        The namespace is used to restrict the results to a certain level. It
        can be (Main) which is the default one, "Wikipedia", "File" or
        others. It is converted to the number sent as the rcnamespace
        parameter. See https://meta.wikimedia.org/wiki/Help:Namespace

        Raises KeyError if the namespace name is not in self.namespaces.
        """

        url_params = {
            "action": "query",
            "list": "recentchanges",
            "format": self.return_format,
            "rcnamespace": self.namespaces[namespace],
        }
        return self.endpoint, url_params

    def get_contributors(self, page="Main_Page", namespace="(Main)"):
        """
        Get the endpoint and parameters corresponding to the contributors of
        a given page or list of pages.
        (https://www.mediawiki.org/wiki/API:Properties#contributors_.2F_pc)

        Use the 'page' parameter to specify the Wikipedia page(s)

        The 'namespace' parameter is accepted but currently not used: no
        namespace filter is added to the request.
        """

        url_params = {
            "action": "query",
            "prop": "contributors",
            "format": self.return_format,
            "titles": page,
        }
        return self.endpoint, url_params

    def get_watchers(self, page="Main_Page"):
        """
        Get the endpoint and parameters corresponding to the watchers
        information of a given page.
        (https://www.mediawiki.org/wiki/API:Properties#info_.2F_in)

        Use the 'page' parameter to specify the Wikipedia page(s)
        """

        url_params = {
            "action": "query",
            "prop": "info",
            "format": self.return_format,
            "inprop": "watchers",
            "titles": page,
        }
        return self.endpoint, url_params

    def get_revisions(self, page="Main_Page"):
        """
        Get the endpoint and parameters corresponding to the list of
        revisions of a given page. Only the revision ids are requested.
        (https://www.mediawiki.org/wiki/API:Properties#revisions_.2F_rv)

        Use the 'page' parameter to specify the Wikipedia page(s)
        """

        url_params = {
            "action": "query",
            "prop": "revisions",
            "format": self.return_format,
            "rvprop": "ids",
            "titles": page,
        }
        return self.endpoint, url_params

    def get_pages_around(self, location="48.8567|2.3508",
                         radius=10000, namespace="(Main)"):
        """
        Get the endpoint and parameters corresponding to the pages around
        coordinates which by default are the coordinates of paris.
        (https://www.mediawiki.org/wiki/Extension:GeoData#list.3Dgeosearch)

        Use the 'location' parameter to specify the coordinates, written as
        "latitude|longitude", and 'radius' to specify the search radius.

        The namespace is used to restrict the results to a certain level. It
        can be (Main) which is the default one, "Wikipedia", "File" or
        others. It is converted to the number sent as the gsnamespace
        parameter. See https://meta.wikimedia.org/wiki/Help:Namespace

        Raises KeyError if the namespace name is not in self.namespaces.
        """

        url_params = {
            "action": "query",
            "list": "geosearch",
            "format": self.return_format,
            "gscoord": location,
            "gsradius": radius,
            "gsnamespace": self.namespaces[namespace],
        }
        return self.endpoint, url_params