blkcg: cfq doesn't need per-cpu dispatch stats

blkio_group_stats_cpu is used to count dispatch stats using per-cpu
counters.  This is used by both blk-throtl and cfq-iosched but the
sharing is rather silly.

* cfq-iosched doesn't need per-cpu dispatch stats.  cfq always updates
  those stats while holding queue_lock.

* blk-throtl needs per-cpu dispatch stats but only service_bytes and
  serviced.  It doesn't make use of sectors.

This patch makes cfq add and use global stats for service_bytes,
serviced and sectors, removes per-cpu sectors counter and moves
per-cpu stat printing code to blk-throttle.c.

Signed-off-by: Tejun Heo <tj@kernel.org>
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5d647ed..cb259bc4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -582,7 +582,6 @@
 
 	stats_cpu = this_cpu_ptr(pd->stats_cpu);
 
-	blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
 	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
 	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
 
@@ -843,6 +842,36 @@
 	throtl_schedule_delayed_work(td, 0);
 }
 
+static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
+				  struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat rwstat = { }, tmp;
+	int i, cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct blkio_group_stats_cpu *sc =
+			per_cpu_ptr(pd->stats_cpu, cpu);
+
+		tmp = blkg_rwstat_read((void *)sc + off);
+		for (i = 0; i < BLKG_RWSTAT_NR; i++)
+			rwstat.cnt[i] += tmp.cnt[i];
+	}
+
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
+static int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
+				  struct seq_file *sf)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
+			  BLKCG_STAT_POL(cft->private),
+			  BLKCG_STAT_OFF(cft->private), true);
+	return 0;
+}
+
 static u64 blkg_prfill_conf_u64(struct seq_file *sf,
 				struct blkg_policy_data *pd, int off)
 {