summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMinijackson2014-10-24 10:20:14 +0200
committerMinijackson2014-10-24 10:20:14 +0200
commita95864928cd74df172f172d7c18b4d89e82548e1 (patch)
tree7a2632e9f5172f36187995d1521228bb5e4e45d8
parenta63aafe88b55c1c500646f5c93e9ff16780d37ca (diff)
downloadwikistats-a95864928cd74df172f172d7c18b4d89e82548e1.tar.gz
Moving to object type ban dict
-rw-r--r--banapedia/Ban.py40
-rw-r--r--bandict/__init__.py91
-rw-r--r--main.py83
3 files changed, 101 insertions, 113 deletions
diff --git a/banapedia/Ban.py b/banapedia/Ban.py
index d8666b4..4714274 100644
--- a/banapedia/Ban.py
+++ b/banapedia/Ban.py
@@ -34,43 +34,3 @@ class Ban:
34 34
35 self.country_code = country_code 35 self.country_code = country_code
36 return country_code 36 return country_code
37
38
def map_ban(ban_dict):
    """Build a ``Ban`` object from one raw ban dictionary.

    The ``"user"`` value is passed through unchanged; the ``"timestamp"``
    and ``"expiry"`` strings are parsed with the ``ISO_TIMESTAMP`` format
    before being handed to ``Ban``.
    """
    user = ban_dict["user"]
    timestamp, expiry = (
        datetime.strptime(ban_dict[field], ISO_TIMESTAMP)
        for field in ("timestamp", "expiry")
    )
    return Ban(user, timestamp, expiry)
45
46
def map_bans(ban_dict_list):
    """Convert every raw ban dictionary of *ban_dict_list* with ``map_ban``.

    Returns a new list holding the converted bans in input order.
    """
    return [map_ban(raw_ban) for raw_ban in ban_dict_list]
53
54
def fetch_multipart_ban_dict(n, query_limit):
    """Fetch up to *n* raw ban dictionaries through successive ``BlockQuery`` calls.

    Each query asks for at most *query_limit* entries and is chained to the
    previous one with the continuation token found in its result.

    Args:
        n: total number of bans wanted.
        query_limit: maximum number of bans requested per query.

    Returns:
        The block dictionaries taken from ``results["query"]["blocks"]`` of
        every query, in fetch order. The list can hold fewer than *n*
        entries when the result set runs out before *n* is reached.
    """
    ban_dict_list = []
    n_fetched = 0
    continue_token = None

    print("[INFO]", "Fetching %d bans" % n)
    while n_fetched < n:
        to_fetch = min(query_limit, n - n_fetched)
        query = BlockQuery(
            bkprop=["user", "timestamp", "expiry"],
            bkshow=["temp", "ip"],
            limit=to_fetch,
            continue_token=continue_token,
        )
        results = query.fetch_result()
        blocks = results["query"]["blocks"]
        ban_dict_list.extend(blocks)
        # Count what was actually received rather than what was requested,
        # so the progress message and the loop condition stay truthful.
        n_fetched += len(blocks)
        print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n))

        # A result without continuation data means there is nothing left to
        # fetch; indexing it unconditionally would raise KeyError.
        continue_token = (
            results.get("query-continue", {})
            .get("blocks", {})
            .get("bkcontinue")
        )
        if continue_token is None or not blocks:
            break

    print("[INFO]", "Bans fetching complete")
    return ban_dict_list
diff --git a/bandict/__init__.py b/bandict/__init__.py
new file mode 100644
index 0000000..389ae0e
--- /dev/null
+++ b/bandict/__init__.py
@@ -0,0 +1,91 @@
1import urllib.request
2import json
3import numpy as np
4from banapedia.wapi.WikipediaQuery import BlockQuery
5from banapedia.Ban import *
6
7
class BanList:
    """Collection of bans loaded from a JSON file or fetched online.

    Attributes set by the constructor:
        dict_list: the raw ban dictionaries, as loaded or fetched.
        ban_list: the ``Ban`` objects built from ``dict_list``, same order.
    """

    def __init__(self, data_file, samples=30000, samples_by_query=500,
                 from_internet=False):
        """Load the raw bans and build the matching ``Ban`` objects.

        Args:
            data_file: path of the JSON file read when *from_internet* is
                false; unused otherwise.
            samples: number of bans to fetch when *from_internet* is true.
            samples_by_query: maximum number of bans requested per query.
            from_internet: fetch the bans with ``fetch_multipart`` instead
                of reading *data_file*.
        """
        if from_internet:
            # Install a global urllib opener using the proxies reported by
            # getproxies() (presumably BlockQuery goes through urllib --
            # confirm against banapedia.wapi).
            proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            self.dict_list = self.fetch_multipart(samples, samples_by_query)
        else:
            with open(data_file, "r") as ban_dict_file:
                self.dict_list = json.load(ban_dict_file)
        self.ban_list = [
            Ban(
                ban_dict["user"],
                datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP),
                datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP),
            )
            for ban_dict in self.dict_list
        ]

    def fetch_multipart(self, n, query_limit):
        """Fetch up to *n* raw ban dictionaries, *query_limit* at most per query.

        Queries are chained with the continuation token of the previous
        result. The returned list can hold fewer than *n* entries when the
        result set runs out before *n* is reached.
        """
        ban_dict_list = []
        n_fetched = 0
        continue_token = None

        print("[INFO]", "Fetching %d bans" % n)
        while n_fetched < n:
            to_fetch = min(query_limit, n - n_fetched)
            query = BlockQuery(
                bkprop=["user", "timestamp", "expiry"],
                bkshow=["temp", "ip"],
                limit=to_fetch,
                continue_token=continue_token,
            )
            results = query.fetch_result()
            blocks = results["query"]["blocks"]
            ban_dict_list.extend(blocks)
            # Count what was actually received, not what was requested.
            n_fetched += len(blocks)
            print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n))

            # A result without continuation data means nothing is left to
            # fetch; indexing it unconditionally would raise KeyError.
            continue_token = (
                results.get("query-continue", {})
                .get("blocks", {})
                .get("bkcontinue")
            )
            if continue_token is None or not blocks:
                break

        print("[INFO]", "Bans fetching complete")
        return ban_dict_list

    def write_to_file(self, outfile):
        """Dump the raw ban dictionaries to *outfile* as tab-indented JSON."""
        with open(outfile, "w") as ban_dict_file:
            json.dump(self.dict_list, ban_dict_file, indent="\t")

    def get_durations(self):
        """Return ``get_duration()`` of every ban, in list order."""
        return [ban.get_duration() for ban in self.ban_list]

    def get_countries(self):
        """Return ``get_country_code()`` of every ban, in list order."""
        return [ban.get_country_code() for ban in self.ban_list]

    def __iter__(self):
        """Iterate over the raw ban dictionaries (not the ``Ban`` objects)."""
        return iter(self.dict_list)

    def items(self):
        """Return ``(index, raw ban dictionary)`` pairs.

        The raw data is a list, which has no ``items()`` method, so the
        previous delegation to ``self.dict_list.items()`` always raised
        ``AttributeError``. The position in the list plays the role of key.
        """
        return list(enumerate(self.dict_list))

    def by_country(self):
        """Group the ``Ban`` objects by their country code.

        Returns:
            A dict mapping each country code to the list of bans having
            that code, in list order.
        """
        ban_duration_by_country = {}

        for ban in self.ban_list:
            country_code = ban.get_country_code()
            ban_duration_by_country.setdefault(country_code, []).append(ban)

        return ban_duration_by_country

    def average_ban_by_country(self):
        """Return the mean ban duration per country code, divided by 30.

        The division converts the value to months (presumably
        ``get_duration()`` is expressed in days -- confirm against ``Ban``).
        """
        return {
            country: np.mean([ban.get_duration() for ban in bans]) / 30
            for country, bans in self.by_country().items()
        }
diff --git a/main.py b/main.py
index dff07f5..548cb30 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,8 @@
1from banapedia.Ban import * 1from banapedia.Ban import *
2import bandict
2from collections import Counter 3from collections import Counter
3import json
4import pygal 4import pygal
5import numpy as np 5import numpy as np
6import urllib.request
7 6
8__author__ = 'pacien' 7__author__ = 'pacien'
9 8
@@ -12,44 +11,15 @@ BAN_MAP_FILE = "output/ban-map.svg"
12BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" 11BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg"
13HIST_FILE = "output/histogram.svg" 12HIST_FILE = "output/histogram.svg"
14 13
15BAN_FILE = "resources/ban_list.json"
16
17SAMPLES = 30000 14SAMPLES = 30000
18SAMPLES_BY_QUERY = 500
19
20
def configure_proxy():
    """Install a global urllib opener using the proxies from ``getproxies()``."""
    handler = urllib.request.ProxyHandler(urllib.request.getproxies())
    urllib.request.install_opener(urllib.request.build_opener(handler))
25
26
def load_from_internet():
    """Configure the proxy, then fetch ``SAMPLES`` bans online.

    The bans are requested ``SAMPLES_BY_QUERY`` at a time through
    ``fetch_multipart_ban_dict``, whose result is returned as is.
    """
    configure_proxy()
    fetched = fetch_multipart_ban_dict(SAMPLES, SAMPLES_BY_QUERY)
    return fetched
30
31
def load_from_local():
    """Return the decoded JSON content of ``BAN_FILE``."""
    with open(BAN_FILE, "r") as handle:
        content = json.load(handle)
    return content
35
36
def write_to_local(ban_dict_list):
    """Write *ban_dict_list* to ``BAN_FILE`` as tab-indented JSON."""
    with open(BAN_FILE, "w") as handle:
        json.dump(ban_dict_list, fp=handle, indent="\t")
40
41
42# ban_dict_list = load_from_internet()
43# write_to_local(ban_dict_list)
44
45ban_dict_list = load_from_local()
46 15
47ban_list = map_bans(ban_dict_list) 16BAN_FILE = "resources/ban_list.json"
48 17
18ban_dict_list = bandict.BanList(BAN_FILE)
49 19
50########## HISTOGRAM ########## 20# ======== HISTOGRAM ======= #
51 21
52ban_durations = [ban.get_duration() for ban in ban_list] 22ban_durations = ban_dict_list.get_durations()
53(ban_durations_bars, bins) = np.histogram(ban_durations, bins=[round(365/12*x) for x in range(1, 50+2)]) 23(ban_durations_bars, bins) = np.histogram(ban_durations, bins=[round(365/12*x) for x in range(1, 50+2)])
54 24
55print("[INFO]", "Generating histogram") 25print("[INFO]", "Generating histogram")
@@ -60,13 +30,10 @@ bar_chart.add("Number of active bans", ban_durations_bars)
60bar_chart.render_to_file(HIST_FILE) 30bar_chart.render_to_file(HIST_FILE)
61print("[INFO]", "Histogram generation complete") 31print("[INFO]", "Histogram generation complete")
62 32
63########## NB BAN MAP ########## 33# ======= NB BAN MAP ======= #
64
def count_by_country(ban_list):
    """Count the bans of *ban_list* per country code.

    Returns a ``Counter`` keyed by the value of ``get_country_code()``.
    """
    return Counter(ban.get_country_code() for ban in ban_list)
68 34
69nb_bans_by_country = count_by_country(ban_list) 35country_ban_list = ban_dict_list.get_countries()
36nb_bans_by_country = Counter(country_ban_list)
70 37
71print("[INFO]", "Generating ban map") 38print("[INFO]", "Generating ban map")
72worldmap_chart = pygal.Worldmap(legend_at_bottom=True) 39worldmap_chart = pygal.Worldmap(legend_at_bottom=True)
@@ -76,35 +43,9 @@ worldmap_chart.render_to_file(BAN_MAP_FILE)
76print("[INFO]", "Ban map generation complete") 43print("[INFO]", "Ban map generation complete")
77 44
78 45
79########## BAN DURATION MAP ########## 46# ======= BAN DURATION MAP ======= #
80
def group_by_country(ban_list):
    """Group the bans of *ban_list* by country code.

    Returns a dict mapping each value of ``get_country_code()`` to the list
    of bans having that code, in input order.
    """
    grouped = {}
    for entry in ban_list:
        grouped.setdefault(entry.get_country_code(), []).append(entry)
    return grouped
93 47
94 48average_ban_duration_by_country = ban_dict_list.average_ban_by_country()
95def calc_average_ban_by_country(ban_by_country_dict):
96 average_ban_duration_by_country = {}
97
98 for country, bans in ban_by_country_dict.items():
99 average = np.mean([ban.get_duration() for ban in bans])