'''
Data munging library to convert database-style records into categorized summaries.
'''

import collections

def add_percent_labels(labels, values):
	total = sum(values)
	return [ '{} ({:1.1f}%)'.format(l, 100 * (v / total)) for l, v in zip(labels, values) ]

def accumulate(things, sieve, count=lambda t: 1):
	'''Collate things into categories and count (or accumulate) them
	'''
	results = collections.defaultdict(int)
	for t in things:
		results[sieve(t)] += count(t)
	return results

def collate(things, sieve):
	'''Collate things into categories
	'''
	results = collections.defaultdict(list)
	for t in things:
		results[sieve(t)].append(t)
	return results

def rank(things):
	'''Take a sorted (k, v) sequence and add a rank to each element (r, k, v).'''
	results = []
	rank = 1
	prev_v = None
	for c, (k, v) in enumerate(things, 1):
		if v != prev_v:
			prev_v = v
			rank = c
		results.append((rank, k, v))
	return results

def simplify(things, threshold=0.005):
	total = sum(things.values())
	remaining = 0

	for k in tuple(things.keys()):
		if things[k] / total < threshold:
			remaining += things[k]
			del things[k]

	return remaining

def accumulate_2d(things, primary_sieve, secondary_sieve, count=lambda t: 1):
	'''Collate the things in two different dimensions, then count them'''
	data = collate(things, primary_sieve)
	for k in data.keys():
		data[k] = accumulate(data[k], secondary_sieve, count)
	return data

def simplify_2d(things, threshold=0.03, category='Other', unconditional=()):
	'''Merge small values into a special category.

	Expected a dictionary of dictionaries of numbers [output of accumulate_2d()].
	'''

	# Find the largest primary category and create a list of every possible
	# secondary key
	max_sum = 0
	keys = set()
	for secondary in things.values():
		total = sum(secondary.values())
		if total > max_sum:
			max_sum = total
		keys |= set(secondary.keys())

	# Recalculate the threshold (as a fraction of the largest primary category)
	threshold = threshold * max_sum

	# Work through each secondary and remove keys that exceed the threshold
	# from the cull list
	keys_to_cull = keys
	for secondary in things.values():
		for k, v in secondary.items():
			if v >= threshold:
				keys_to_cull.discard(k)
	keys_to_cull |= set(unconditional)

	# Work through each secondary simplifying each one
	for secondary in things.values():
		for k in tuple(secondary.keys()):
			if k in keys_to_cull:
				if category not in secondary:
					secondary[category] = 0
				secondary[category] += secondary[k]
				del secondary[k]