summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPacien TRAN-GIRARD2014-10-24 18:05:48 +0000
committerPacien TRAN-GIRARD2014-10-24 18:05:48 +0000
commit4403fda939ef42aeffeccb343d74f3dc3b840f91 (patch)
tree63fd704f15f3030f1455aad0ef92403c5d093c70
parent16529a0d212e1387eacd590c0e5e1b1a13dc2641 (diff)
parentbdf9099df8c2a4636b0ad0e710b73330877eef37 (diff)
downloadwikistats-4403fda939ef42aeffeccb343d74f3dc3b840f91.tar.gz
Merge branch 'refactor' into 'master'
Refactor See merge request !1
-rw-r--r--banapedia/Ban.py36
-rw-r--r--banapedia/__init__.py1
-rw-r--r--banapedia/api/__init__.py0
-rw-r--r--banapedia/wapi/WikipediaQuery.py42
-rw-r--r--banapedia/wapi/__init__.py0
-rw-r--r--bandict/__init__.py91
-rw-r--r--main.py101
-rw-r--r--rapport.md2
-rw-r--r--rapport.pdfbin86060 -> 0 bytes
-rw-r--r--sysproxy.py7
-rw-r--r--wikibania/__init__.py1
-rw-r--r--wikibania/api/Query.py (renamed from banapedia/api/Query.py)7
-rw-r--r--wikibania/api/__init__.py1
-rw-r--r--wikibania/ban/Ban.py37
-rw-r--r--wikibania/ban/BanDB.py50
-rw-r--r--wikibania/ban/BanDBWrapper.py25
-rw-r--r--wikibania/ban/__init__.py1
-rw-r--r--wikibania/wapi/WikipediaQuery.py48
-rw-r--r--wikibania/wapi/__init__.py1
19 files changed, 246 insertions, 205 deletions
diff --git a/banapedia/Ban.py b/banapedia/Ban.py
deleted file mode 100644
index 4714274..0000000
--- a/banapedia/Ban.py
+++ /dev/null
@@ -1,36 +0,0 @@
1from banapedia.wapi.WikipediaQuery import BlockQuery
2from datetime import datetime
3import pygeoip
4
5__author__ = 'pacien'
6
7
8GEOIP_FILE = "/usr/share/GeoIP/GeoIP.dat"
9geoip = pygeoip.GeoIP(GEOIP_FILE)
10
11ISO_TIMESTAMP = "%Y-%m-%dT%H:%M:%SZ"
12
13
class Ban:
    """A single ban: the banned IP plus the start and end of the ban.

    The country code is resolved on first request through the module-level
    ``geoip`` database handle and then remembered on the instance.
    """

    def __init__(self, ip, start, end):
        """Store the ban's address and its time bounds.

        :param ip: address handed to the GeoIP lookup.
        :param start: beginning of the ban; ``end - start`` must expose
            ``.days`` (e.g. ``datetime`` objects).
        :param end: expiry of the ban, same type as ``start``.
        """
        self.ip, self.start, self.end = ip, start, end
        # None means "not looked up yet"; "" means "lookup failed".
        self.country_code = None

    def get_duration(self):
        """Return the ban length as the whole-day part of ``end - start``."""
        elapsed = self.end - self.start
        return elapsed.days

    def get_country_code(self):
        """Return the lower-cased country code for ``self.ip``.

        The lookup runs at most once per instance. When the GeoIP lookup
        raises ``pygeoip.GeoIPError`` an error line is printed and the empty
        string is cached and returned instead.
        """
        if self.country_code is None:
            try:
                resolved = geoip.country_code_by_addr(self.ip).lower()
            except pygeoip.GeoIPError:
                print("[ERROR]", "Could not determine country for ip", self.ip)
                resolved = ""
            self.country_code = resolved

        return self.country_code
diff --git a/banapedia/__init__.py b/banapedia/__init__.py
deleted file mode 100644
index 8b13789..0000000
--- a/banapedia/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/banapedia/api/__init__.py b/banapedia/api/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/banapedia/api/__init__.py
+++ /dev/null
diff --git a/banapedia/wapi/WikipediaQuery.py b/banapedia/wapi/WikipediaQuery.py
deleted file mode 100644
index d3d2f94..0000000
--- a/banapedia/wapi/WikipediaQuery.py
+++ /dev/null
@@ -1,42 +0,0 @@
1from ..api.Query import JSONQuery
2
3__author__ = 'pacien'
4
5WIKIPEDIA_QUERY_BASE_URL = "https://en.wikipedia.org/w/api.php"
6LIST_SEPARATOR = "|"
7DEFAULT_BKPROP = ["id", "user", "userid", "by", "byid", "timestamp", "expiry", "reason", "range", "flags"]
8DEFAULT_BKSHOW = ["account", "temp", "ip", "range"]
9
10
class WikipediaQuery(JSONQuery):
    """JSON query against ``WIKIPEDIA_QUERY_BASE_URL`` with ``action=query``."""

    def __init__(self, params=None):
        """Build the query parameters and initialise the underlying JSONQuery.

        :param params: optional mapping of extra request parameters. It is
            copied, never modified; ``action`` and ``format`` are always
            forced to ``"query"`` and ``"json"``.
        """
        # Work on a private copy: the previous ``params={}`` default was a
        # single dict shared by every call and was mutated in place, so
        # parameters leaked between instances and into the caller's dict.
        merged = dict(params) if params is not None else {}
        merged.update({
            "action": "query",
            "format": "json",
        })
        JSONQuery.__init__(self, base_url=WIKIPEDIA_QUERY_BASE_URL, params=merged)
18
19
class ListQuery(WikipediaQuery):
    """WikipediaQuery that targets one ``list=`` module."""

    def __init__(self, list_name, params=None):
        """Add the ``list`` parameter and delegate to WikipediaQuery.

        :param list_name: value sent as the ``list`` request parameter.
        :param params: optional mapping of extra request parameters. It is
            copied, never modified.
        """
        # Copy first: the previous ``params={}`` default was shared between
        # calls and updated in place, which also altered the caller's dict.
        merged = dict(params) if params is not None else {}
        merged.update({
            "list": list_name,
        })
        WikipediaQuery.__init__(self, merged)
26
27
class BlockQuery(ListQuery):
    """ListQuery for the ``blocks`` list."""

    def __init__(self, bkprop=DEFAULT_BKPROP, bkshow=DEFAULT_BKSHOW,
                 bkdir="newer", limit=500, continue_token=None):
        """Assemble the ``bk*`` parameters and delegate to ListQuery.

        :param bkprop: iterable of strings, joined with ``LIST_SEPARATOR``
            into the ``bkprop`` parameter.
        :param bkshow: iterable of strings, joined with ``LIST_SEPARATOR``
            into the ``bkshow`` parameter.
        :param bkdir: value of the ``bkdir`` parameter.
        :param limit: value of the ``bklimit`` parameter.
        :param continue_token: when not None, passed as ``bkcontinue``.
        """
        block_params = dict(
            bkprop=LIST_SEPARATOR.join(bkprop),
            bkshow=LIST_SEPARATOR.join(bkshow),
            bkdir=bkdir,
            bklimit=limit,
        )

        # Only include the continuation parameter when resuming a listing.
        if continue_token is not None:
            block_params["bkcontinue"] = continue_token

        ListQuery.__init__(self, "blocks", params=block_params)
41
42
diff --git a/banapedia/wapi/__init__.py b/banapedia/wapi/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/banapedia/wapi/__init__.py
+++ /dev/null
diff --git a/bandict/__init__.py b/bandict/__init__.py
deleted file mode 100644
index 389ae0e..0000000
--- a/bandict/__init__.py
+++ /dev/null
@@ -1,91 +0,0 @@
1import urllib.request
2import json
3import numpy as np
4from banapedia.wapi.WikipediaQuery import BlockQuery
5from banapedia.Ban import *
6
7
class BanList():
    """A list of bans, either fetched with BlockQuery or read from a JSON file.

    ``dict_list`` holds the raw ban dictionaries; ``ban_list`` holds the
    corresponding ``Ban`` objects built from their ``user``, ``timestamp``
    and ``expiry`` entries.
    """

    def __init__(self, data_file, samples=30000, samples_by_query=500,
                 from_internet=False):
        """Load the raw bans and build the ``Ban`` objects.

        :param data_file: path of the JSON file read when ``from_internet``
            is false.
        :param samples: number of bans requested when fetching.
        :param samples_by_query: maximum number of bans asked per query.
        :param from_internet: fetch with BlockQuery instead of reading
            ``data_file``.
        """
        if from_internet:
            # Install a global urllib opener that uses the proxies reported
            # by urllib.request.getproxies().
            proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            self.dict_list = self.fetch_multipart(samples, samples_by_query)
        else:
            with open(data_file, "r") as ban_dict_file:
                self.dict_list = json.load(ban_dict_file)

        self.ban_list = [
            Ban(
                ban_dict["user"],
                datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP),
                datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP),
            )
            for ban_dict in self.dict_list
        ]

    def _fetch_page(self, limit, continue_token):
        """Run one BlockQuery for at most ``limit`` bans and return its result."""
        query = BlockQuery(
            bkprop=["user", "timestamp", "expiry"],
            bkshow=["temp", "ip"],
            limit=limit,
            continue_token=continue_token,
        )
        return query.fetch_result()

    @staticmethod
    def _next_continue_token(results):
        """Return the ``bkcontinue`` token of a result, or None on the last page."""
        token = (results.get("query-continue", {})
                 .get("blocks", {})
                 .get("bkcontinue"))
        if token is None:
            # Newer MediaWiki responses put the token under "continue".
            # NOTE(review): only bkcontinue is sent back on the next request;
            # confirm the API accepts that without the "continue" value.
            token = results.get("continue", {}).get("bkcontinue")
        return token

    def fetch_multipart(self, n, query_limit):
        """Fetch up to ``n`` raw ban dictionaries, ``query_limit`` per query.

        Stops early when the API reports no further page (no continuation
        token) or returns an empty page, instead of raising ``KeyError`` as
        the unconditional ``results["query-continue"]`` lookup used to.
        Progress counts the bans actually received, not the number asked for.

        :return: list of the ban dictionaries received, at most ``n`` long
            provided each page respects the requested limit.
        """
        ban_dict_list = []
        continue_token = None

        print("[INFO]", "Fetching %d bans" % n)
        while len(ban_dict_list) < n:
            to_fetch = min(query_limit, n - len(ban_dict_list))
            results = self._fetch_page(to_fetch, continue_token)
            # Kept strict on purpose: a result without "query"/"blocks"
            # (e.g. an API error payload) should fail loudly.
            blocks = results["query"]["blocks"]
            ban_dict_list.extend(blocks)
            print("[INFO]", "Fetched %d over %d bans" % (len(ban_dict_list), n))

            continue_token = self._next_continue_token(results)
            if continue_token is None or not blocks:
                break

        print("[INFO]", "Bans fetching complete")
        return ban_dict_list

    def write_to_file(self, outfile):
        """Dump the raw ban dictionaries to ``outfile`` as tab-indented JSON."""
        with open(outfile, "w") as ban_dict_file:
            json.dump(self.dict_list, ban_dict_file, indent="\t")

    def get_durations(self):
        """Return ``get_duration()`` of every ban, in list order."""
        return [ban.get_duration() for ban in self.ban_list]

    def get_countries(self):
        """Return ``get_country_code()`` of every ban, in list order."""
        return [ban.get_country_code() for ban in self.ban_list]

    def __iter__(self):
        """Iterate over the raw ban dictionaries."""
        return iter(self.dict_list)

    def items(self):
        # NOTE(review): fetch_multipart returns a list and __init__ indexes
        # each element by key, so dict_list looks like a list and this call
        # would raise AttributeError. Intended behaviour is unclear; left
        # unchanged pending confirmation.
        return self.dict_list.items()

    def by_country(self):
        """Group the bans by country code.

        :return: dict mapping each country code to the list of its bans.
        """
        ban_duration_by_country = {}

        for ban in self.ban_list:
            ban_duration_by_country.setdefault(
                ban.get_country_code(), []).append(ban)

        return ban_duration_by_country

    def average_ban_by_country(self):
        """Return the mean ban duration per country code, in 30-day months."""
        return {
            # get_duration() is in days; dividing by 30 converts to months.
            country: np.mean([ban.get_duration() for ban in bans]) / 30
            for country, bans in self.by_country().items()
        }
diff --git a/main.py b/main.py
index 0d12a1e..da7c6ae 100644
--- a/main.py
+++ b/main.py
@@ -1,62 +1,101 @@
1from banapedia.Ban import *
2import bandict
3from collections import Counter 1from collections import Counter
2import webbrowser
3
4import pygal 4import pygal
5import numpy as np 5import numpy as np
6import pygeoip
7
8from wikibania.ban.BanDB import BanDB
9from wikibania.ban.BanDBWrapper import BanDBWrapper
10import sysproxy
11
12
13# PARAMS
14
15GEOIP_DB = "/usr/share/GeoIP/GeoIP.dat"
6 16
7BAN_MAP_FILE = "output/ban-map.svg" 17BAN_MAP_FILE = "output/ban-map.svg"
8BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" 18BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg"
9HIST_FILE = "output/histogram.svg" 19HIST_FILE = "output/histogram.svg"
20STATS_FILE = "output/stats.txt"
21
22BAN_DB_FILE = "resources/ban_list.json"
23
24FETCH_SAMPLES = 2000
25FETCH_DB = False
26DUMP_DB = False
27LOAD_DB = False
28
29OPEN_FILES = False
30
31