aboutsummaryrefslogtreecommitdiff
path: root/util/qdist.c
blob: eb2236cdc85ac4594e5e385128a9b32c69f70268 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
/*
 * qdist.c - QEMU helpers for handling frequency distributions of data.
 *
 * Copyright (C) 2016, Emilio G. Cota <cota@braap.org>
 *
 * License: GNU GPL, version 2 or later.
 *   See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qemu/qdist.h"

#include <math.h>
#ifndef NAN
#define NAN (0.0 / 0.0)
#endif

void qdist_init(struct qdist *dist)
{
    dist->entries = g_malloc(sizeof(*dist->entries));
    dist->size = 1;
    dist->n = 0;
}

void qdist_destroy(struct qdist *dist)
{
    g_free(dist->entries);
}

static inline int qdist_cmp_double(double a, double b)
{
    if (a > b) {
        return 1;
    } else if (a < b) {
        return -1;
    }
    return 0;
}

static int qdist_cmp(const void *ap, const void *bp)
{
    const struct qdist_entry *a = ap;
    const struct qdist_entry *b = bp;

    return qdist_cmp_double(a->x, b->x);
}

void qdist_add(struct qdist *dist, double x, long count)
{
    struct qdist_entry *entry = NULL;

    if (dist->n) {
        struct qdist_entry e;

        e.x = x;
        entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp);
    }

    if (entry) {
        entry->count += count;
        return;
    }

    if (unlikely(dist->n == dist->size)) {
        dist->size *= 2;
        dist->entries = g_realloc(dist->entries,
                                  sizeof(*dist->entries) * (dist->size));
    }
    dist->n++;
    entry = &dist->entries[dist->n - 1];
    entry->x = x;
    entry->count = count;
    qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp);
}

void qdist_inc(struct qdist *dist, double x)
{
    qdist_add(dist, x, 1);
}

/*
 * Unicode for block elements. See:
 *   https://en.wikipedia.org/wiki/Block_Elements
 */
static const gunichar qdist_blocks[] = {
    0x2581,
    0x2582,
    0x2583,
    0x2584,
    0x2585,
    0x2586,
    0x2587,
    0x2588
};

#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks)

/*
 * Print a distribution into a string.
 *
 * This function assumes that appropriate binning has been done on the input;
 * see qdist_bin__internal() and qdist_pr_plain().
 *
 * Callers must free the returned string with g_free().
 */
static char *qdist_pr_internal(const struct qdist *dist)
{
    double min, max;
    GString *s = g_string_new("");
    size_t i;

    /* if only one entry, its printout will be either full or empty */
    if (dist->n == 1) {
        if (dist->entries[0].count) {
            g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]);
        } else {
            g_string_append_c(s, ' ');
        }
        goto out;
    }

    /* get min and max counts */
    min = dist->entries[0].count;
    max = min;
    for (i = 0; i < dist->n; i++) {
        struct qdist_entry *e = &dist->entries[i];

        if (e->count < min) {
            min = e->count;
        }
        if (e->count > max) {
            max = e->count;
        }
    }

    for (i = 0; i < dist->n; i++) {
        struct qdist_entry *e = &dist->entries[i];
        int index;

        /* make an exception with 0; instead of using block[0], print a space */
        if (e->count) {
            /* divide first to avoid loss of precision when e->count == max */
            index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 1);
            g_string_append_unichar(s, qdist_blocks[index]);
        } else {
            g_string_append_c(s, ' ');
        }
    }
 out:
    return g_string_free(s, FALSE);
}

/*
 * Bin the distribution in @from into @n bins of consecutive, non-overlapping
 * intervals, copying the result to @to.
 *
 * This function is internal to qdist: only this file and test code should
 * ever call it.
 *
 * Note: calling this function on an already-binned qdist is a bug.
 *
 * If @n == 0 or @from->n == 1, use @from->n.
 */
void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n)
{
    double xmin, xmax;
    double step;
    size_t i, j;

    qdist_init(to);

    if (from->n == 0) {
        return;
    }
    if (n == 0 || from->n == 1) {
        n = from->n;
    }

    /* set equally-sized bins between @from's left and right */
    xmin = qdist_xmin(from);
    xmax = qdist_xmax(from);
    step = (xmax - xmin) / n;

    if (n == from->n) {
        /* if @from's entries are equally spaced, no need to re-bin */
        for (i = 0; i < from->n; i++) {
            if (from->entries[i].x != xmin + i * step) {
                goto rebin;
            }
        }
        /* they're equally spaced, so copy the dist and bail out */
        to->entries = g_realloc_n(to->entries, n, sizeof(*to->entries));
        to->n = from->n;
        memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n);
        return;
    }

 rebin:
    j = 0;
    for (i = 0; i < n; i++) {
        double x;
        double left, right;

        left = xmin + i * step;
        right = xmin + (i + 1) * step;

        /* Add x, even if it might not get any counts later */
        x = left;
        qdist_add(to, x, 0);

        /*
         * To avoid double-counting we capture [left, right) ranges, except for
         * the righmost bin, which captures a [left, right] range.
         */
        while (j < from->n && (from->entries[j].x < right || i == n - 1)) {
            struct qdist_entry *o = &from->entries[j];

            qdist_add(to, x, o->count);
            j++;
        }
    }
}

/*
 * Print @dist into a string, after re-binning it into @n bins of consecutive,
 * non-overlapping intervals.
 *
 * If @n == 0, use @orig->n.
 *
 * Callers must free the returned string with g_free().
 */
char *qdist_pr_plain(const struct qdist *dist, size_t n)
{
    struct qdist binned;
    char *ret;

    if (dist->n == 0) {
        return NULL;
    }
    qdist_bin__internal(&binned, dist, n);
    ret = qdist_pr_internal(&binned);
    qdist_destroy(&binned);
    return ret;
}

static char *qdist_pr_label(const struct qdist *dist, size_t n_bins,
                            uint32_t opt, bool is_left)
{
    const char *percent;
    const char *lparen;
    const char *rparen;
    GString *s;
    double x1, x2, step;
    double x;
    double n;
    int dec;

    s = g_string_new("");
    if (!(opt & QDIST_PR_LABELS)) {
        goto out;
    }

    dec = opt & QDIST_PR_NODECIMAL ? 0 : 1;
    percent = opt & QDIST_PR_PERCENT ? "%" : "";

    n = n_bins ? n_bins : dist->n;
    x = is_left ? qdist_xmin(dist) : qdist_xmax(dist);
    step = (qdist_xmax(dist) - qdist_xmin(dist)) / n;

    if (opt & QDIST_PR_100X) {
        x *= 100.0;
        step *= 100.0;
    }
    if (opt & QDIST_PR_NOBINRANGE) {
        lparen = rparen = "";
        x1 = x;
        x2 = x; /* unnecessary, but a dumb compiler might not figure it out */
    } else {
        lparen = "[";
        rparen = is_left ? ")" : "]";
        if (is_left) {
            x1 = x;
            x2 = x + step;
        } else {
            x1 = x - step;
            x2 = x;
        }
    }
    g_string_append_printf(s, "%s%.*f", lparen, dec, x1);
    if (!(opt & QDIST_PR_NOBINRANGE)) {
        g_string_append_printf(s, ",%.*f%s", dec, x2, rparen);
    }
    g_string_append(s, percent);
 out:
    return g_string_free(s, FALSE);
}

/*
 * Print the distribution's histogram into a string.
 *
 * See also: qdist_pr_plain().
 *
 * Callers must free the returned string with g_free().
 */
char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt)
{
    const char *border = opt & QDIST_PR_BORDER ? "|" : "";
    char *llabel, *rlabel;
    char *hgram;
    GString *s;

    if (dist->n == 0) {
        return NULL;
    }

    s = g_string_new("");

    llabel = qdist_pr_label(dist, n_bins, opt, true);
    rlabel = qdist_pr_label(dist, n_bins, opt, false);
    hgram = qdist_pr_plain(dist, n_bins);
    g_string_append_printf(s, "%s%s%s%s%s",
                           llabel, border, hgram, border, rlabel);
    g_free(llabel);
    g_free(rlabel);
    g_free(hgram);

    return g_string_free(s, FALSE);
}

static inline double qdist_x(const struct qdist *dist, int index)
{
    if (dist->n == 0) {
        return NAN;
    }
    return dist->entries[index].x;
}

double qdist_xmin(const struct qdist *dist)
{
    return qdist_x(dist, 0);
}

double qdist_xmax(const struct qdist *dist)
{
    return qdist_x(dist, dist->n - 1);
}

size_t qdist_unique_entries(const struct qdist *dist)
{
    return dist->n;
}

unsigned long qdist_sample_count(const struct qdist *dist)
{
    unsigned long count = 0;
    size_t i;

    for (i = 0; i < dist->n; i++) {
        struct qdist_entry *e = &dist->entries[i];

        count += e->count;
    }
    return count;
}

static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
                                 size_t n, unsigned long count)
{
    /* amortize the recursion by using a base case > 2 */
    if (n <= 8) {
        size_t i;
        double ret = 0;

        for (i = 0; i < n; i++) {
            struct qdist_entry *e = &dist->entries[index + i];

            ret += e->x * e->count / count;
        }
        return ret;
    } else {
        size_t n2 = n / 2;

        return qdist_pairwise_avg(dist, index, n2, count) +
               qdist_pairwise_avg(dist, index + n2, n - n2, count);
    }
}

double qdist_avg(const struct qdist *dist)
{
    unsigned long count;

    count = qdist_sample_count(dist);
    if (!count) {
        return NAN;
    }
    return qdist_pairwise_avg(dist, 0, dist->n, count);
}