From a95864928cd74df172f172d7c18b4d89e82548e1 Mon Sep 17 00:00:00 2001 From: Minijackson Date: Fri, 24 Oct 2014 10:20:14 +0200 Subject: Moving to object type ban dict --- banapedia/Ban.py | 40 ----------------------- bandict/__init__.py | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++ main.py | 83 ++++++------------------------------------------ 3 files changed, 101 insertions(+), 113 deletions(-) create mode 100644 bandict/__init__.py diff --git a/banapedia/Ban.py b/banapedia/Ban.py index d8666b4..4714274 100644 --- a/banapedia/Ban.py +++ b/banapedia/Ban.py @@ -34,43 +34,3 @@ class Ban: self.country_code = country_code return country_code - - -def map_ban(ban_dict): - return Ban( - ban_dict["user"], - datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP), - datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP), - ) - - -def map_bans(ban_dict_list): - ban_list = [] - for ban_dict in ban_dict_list: - ban_list.append(map_ban(ban_dict)) - - return ban_list - - -def fetch_multipart_ban_dict(n, query_limit): - ban_dict_list = [] - n_fetched = 0 - continue_token = None - - print("[INFO]", "Fetching %d bans" % n) - while n_fetched < n: - to_fetch = min(query_limit, n - n_fetched) - query = BlockQuery( - bkprop=["user", "timestamp", "expiry"], - bkshow=["temp", "ip"], - limit=to_fetch, - continue_token=continue_token, - ) - results = query.fetch_result() - ban_dict_list.extend(results["query"]["blocks"]) - continue_token = results["query-continue"]["blocks"]["bkcontinue"] - n_fetched += to_fetch - print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n)) - - print("[INFO]", "Bans fetching complete") - return ban_dict_list diff --git a/bandict/__init__.py b/bandict/__init__.py new file mode 100644 index 0000000..389ae0e --- /dev/null +++ b/bandict/__init__.py @@ -0,0 +1,91 @@ +import urllib.request +import json +import numpy as np +from banapedia.wapi.WikipediaQuery import BlockQuery +from banapedia.Ban import * + + +class BanList(): + + def __init__(self, data_file, samples=30000, samples_by_query=500, + from_internet=False): + if from_internet: + proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) + opener = urllib.request.build_opener(proxy) + urllib.request.install_opener(opener) + self.dict_list = self.fetch_multipart(samples, samples_by_query) + else: + with open(data_file, "r") as ban_dict_file: + self.dict_list = json.load(ban_dict_file) + self.ban_list = [] + for ban_dict in self.dict_list: + self.ban_list.append(Ban( + ban_dict["user"], + datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP), + datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP), + )) + + def fetch_multipart(self, n, query_limit): + ban_dict_list = [] + n_fetched = 0 + continue_token = None + + print("[INFO]", "Fetching %d bans" % n) + while n_fetched < n: + to_fetch = min(query_limit, n - n_fetched) + query = BlockQuery( + bkprop=["user", "timestamp", "expiry"], + bkshow=["temp", "ip"], + limit=to_fetch, + continue_token=continue_token, + ) + results = query.fetch_result() + ban_dict_list.extend(results["query"]["blocks"]) + continue_token = results["query-continue"]["blocks"]["bkcontinue"] + n_fetched += to_fetch + print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n)) + + print("[INFO]", "Bans fetching complete") + return ban_dict_list + + def write_to_file(self, outfile): + with open(outfile, "w") as ban_dict_file: + json.dump(self.dict_list, ban_dict_file, indent="\t") + + def get_durations(self): + return [ban.get_duration() for ban in self.ban_list] + + def get_countries(self): + return [ban.get_country_code() for ban in self.ban_list] + + def __iter__(self): + return self.dict_list.__iter__() + + def items(self): + return self.dict_list.items() + + def by_country(self): + ban_duration_by_country = {} + + for ban in self.ban_list: + country_code = ban.get_country_code() + + if country_code not in ban_duration_by_country.keys(): + ban_duration_by_country[country_code] = [] + + ban_duration_by_country[country_code].append(ban) + + return ban_duration_by_country + + def average_ban_by_country(self): + average_ban_duration_ctry = {} + + for country, bans in self.by_country().items(): + average = np.mean([ban.get_duration() for ban in bans]) + average_ban_duration_ctry[country] = average + + # In months + average_ban_duration_ctry = {country: duration/30 + for country, duration in + average_ban_duration_ctry.items()} + return average_ban_duration_ctry diff --git a/main.py b/main.py index dff07f5..548cb30 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,8 @@ from banapedia.Ban import * +import bandict from collections import Counter -import json import pygal import numpy as np -import urllib.request __author__ = 'pacien' @@ -12,44 +11,15 @@ BAN_MAP_FILE = "output/ban-map.svg" BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" HIST_FILE = "output/histogram.svg" -BAN_FILE = "resources/ban_list.json" - SAMPLES = 30000 -SAMPLES_BY_QUERY = 500 - - -def configure_proxy(): - proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) - opener = urllib.request.build_opener(proxy) - urllib.request.install_opener(opener) - - -def load_from_internet(): - configure_proxy() - return fetch_multipart_ban_dict(SAMPLES, SAMPLES_BY_QUERY) - - -def load_from_local(): - with open(BAN_FILE, "r") as ban_dict_file: - return json.load(ban_dict_file) - - -def write_to_local(ban_dict_list): - with open(BAN_FILE, "w") as ban_dict_file: - json.dump(ban_dict_list, ban_dict_file, indent="\t") - - -# ban_dict_list = load_from_internet() -# write_to_local(ban_dict_list) - -ban_dict_list = load_from_local() -ban_list = map_bans(ban_dict_list) +BAN_FILE = "resources/ban_list.json" +ban_dict_list = bandict.BanList(BAN_FILE) -########## HISTOGRAM ########## +# ======== HISTOGRAM ======= # -ban_durations = [ban.get_duration() for ban in ban_list] +ban_durations = ban_dict_list.get_durations() (ban_durations_bars, bins) = np.histogram(ban_durations, bins=[round(365/12*x) for x in range(1, 50+2)]) print("[INFO]", "Generating histogram") @@ -60,13 +30,10 @@ bar_chart.add("Number of active bans", ban_durations_bars) bar_chart.render_to_file(HIST_FILE) print("[INFO]", "Histogram generation complete") -########## NB BAN MAP ########## - -def count_by_country(ban_list): - country_ban_list = [ban.get_country_code() for ban in ban_list] - return Counter(country_ban_list) +# ======= NB BAN MAP ======= # -nb_bans_by_country = count_by_country(ban_list) +country_ban_list = ban_dict_list.get_countries() +nb_bans_by_country = Counter(country_ban_list) print("[INFO]", "Generating ban map") worldmap_chart = pygal.Worldmap(legend_at_bottom=True) @@ -76,35 +43,9 @@ worldmap_chart.render_to_file(BAN_MAP_FILE) print("[INFO]", "Ban map generation complete") -########## BAN DURATION MAP ########## - -def group_by_country(ban_list): - ban_duration_by_country = {} - - for ban in ban_list: - country_code = ban.get_country_code() - - if country_code not in ban_duration_by_country.keys(): - ban_duration_by_country[country_code] = [] - - ban_duration_by_country[country_code].append(ban) - - return ban_duration_by_country +# ======= BAN DURATION MAP ======= # - -def calc_average_ban_by_country(ban_by_country_dict): - average_ban_duration_by_country = {} - - for country, bans in ban_by_country_dict.items(): - average = np.mean([ban.get_duration() for ban in bans]) - average_ban_duration_by_country[country] = average - - return average_ban_duration_by_country - -ban_duration_by_country = group_by_country(ban_list) -average_ban_duration_by_country = calc_average_ban_by_country(ban_duration_by_country) - -average_ban_duration_by_country = {country: duration/30 for country, duration in average_ban_duration_by_country.items()} +average_ban_duration_by_country = ban_dict_list.average_ban_by_country() print("[INFO]", "Generating ban duration map") worldmap_chart = pygal.Worldmap(legend_at_bottom=True) @@ -113,10 +54,6 @@ worldmap_chart.add("Average ban duration (months)", average_ban_duration_by_coun worldmap_chart.render_to_file(BAN_DURATION_MAP_FILE) print("[INFO]", "Ban duration map generation complete") -print("\nTHIS WAS A TRIUMPH!") -print("I'M MAKING A NOTE HERE:") -print("HUGE [SUCCESS]\n") - print("Some additional stats about ban durations:") print(" Mean: %.2f days" % np.mean(ban_durations)) print(" Median: %.2f days" % np.median(ban_durations)) -- cgit v1.2.3