blob: d1b4a4624d81fafbae6c68e5e64423ad5b33fc13 [file] [log] [blame]
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001/*
2 * Copyright (c) 2012, Microsoft Corporation.
3 *
4 * Author:
5 * K. Y. Srinivasan <kys@microsoft.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/kernel.h>
22#include <linux/mman.h>
23#include <linux/delay.h>
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/kthread.h>
28#include <linux/completion.h>
29#include <linux/memory_hotplug.h>
30#include <linux/memory.h>
31#include <linux/notifier.h>
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -080032#include <linux/percpu_counter.h>
33
34#include <linux/hyperv.h>
35
36/*
37 * We begin with definitions supporting the Dynamic Memory protocol
38 * with the host.
39 *
40 * Begin protocol definitions.
41 */
42
43
44
/*
 * Protocol versions. A version is a 32-bit value: the low word is the
 * minor version, the high word the major version.
 *
 * History:
 * Initial version 1.0
 * Changed to 0.1 on 2009/03/25
 * Changes to 0.2 on 2009/05/14
 * Changes to 0.3 on 2009/12/03
 * Changed to 1.0 on 2011/04/05
 */

/*
 * Compose a version from its parts. Major is widened to __u32 before the
 * shift so that a value with bit 15 set cannot overflow a signed int.
 */
#define DYNMEM_MAKE_VERSION(Major, Minor) \
	((__u32)(((__u32)(Major) << 16) | (Minor)))
/* Extract the major version (high word). */
#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
/* Extract the minor version: the whole low word, i.e. 16 bits. */
#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xffff)
60
61enum {
62 DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
63 DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
64
65 DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
66 DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
67
68 DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN8
69};
70
71
72
/*
 * Message types. The numeric values are part of the protocol and are
 * therefore spelled out explicitly.
 */

enum dm_message_type {
	/* Messages defined by version 0.3. */
	DM_ERROR			= 0,
	DM_VERSION_REQUEST		= 1,
	DM_VERSION_RESPONSE		= 2,
	DM_CAPABILITIES_REPORT		= 3,
	DM_CAPABILITIES_RESPONSE	= 4,
	DM_STATUS_REPORT		= 5,
	DM_BALLOON_REQUEST		= 6,
	DM_BALLOON_RESPONSE		= 7,
	DM_UNBALLOON_REQUEST		= 8,
	DM_UNBALLOON_RESPONSE		= 9,
	DM_MEM_HOT_ADD_REQUEST		= 10,
	DM_MEM_HOT_ADD_RESPONSE		= 11,
	DM_VERSION_03_MAX		= 11,

	/* Messages added by version 1.0. */
	DM_INFO_MESSAGE			= 12,
	DM_VERSION_1_MAX		= 12
};
100
101
102/*
103 * Structures defining the dynamic memory management
104 * protocol.
105 */
106
107union dm_version {
108 struct {
109 __u16 minor_version;
110 __u16 major_version;
111 };
112 __u32 version;
113} __packed;
114
115
116union dm_caps {
117 struct {
118 __u64 balloon:1;
119 __u64 hot_add:1;
K. Y. Srinivasan647965a2013-03-29 07:36:11 -0700120 /*
121 * To support guests that may have alignment
122 * limitations on hot-add, the guest can specify
123 * its alignment requirements; a value of n
124 * represents an alignment of 2^n in mega bytes.
125 */
126 __u64 hot_add_alignment:4;
127 __u64 reservedz:58;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800128 } cap_bits;
129 __u64 caps;
130} __packed;
131
132union dm_mem_page_range {
133 struct {
134 /*
135 * The PFN number of the first page in the range.
136 * 40 bits is the architectural limit of a PFN
137 * number for AMD64.
138 */
139 __u64 start_page:40;
140 /*
141 * The number of pages in the range.
142 */
143 __u64 page_cnt:24;
144 } finfo;
145 __u64 page_range;
146} __packed;
147
148
149
150/*
151 * The header for all dynamic memory messages:
152 *
153 * type: Type of the message.
154 * size: Size of the message in bytes; including the header.
155 * trans_id: The guest is responsible for manufacturing this ID.
156 */
157
158struct dm_header {
159 __u16 type;
160 __u16 size;
161 __u32 trans_id;
162} __packed;
163
164/*
165 * A generic message format for dynamic memory.
166 * Specific message formats are defined later in the file.
167 */
168
169struct dm_message {
170 struct dm_header hdr;
171 __u8 data[]; /* enclosed message */
172} __packed;
173
174
175/*
176 * Specific message types supporting the dynamic memory protocol.
177 */
178
179/*
180 * Version negotiation message. Sent from the guest to the host.
181 * The guest is free to try different versions until the host
182 * accepts the version.
183 *
184 * dm_version: The protocol version requested.
185 * is_last_attempt: If TRUE, this is the last version guest will request.
186 * reservedz: Reserved field, set to zero.
187 */
188
189struct dm_version_request {
190 struct dm_header hdr;
191 union dm_version version;
192 __u32 is_last_attempt:1;
193 __u32 reservedz:31;
194} __packed;
195
196/*
197 * Version response message; Host to Guest and indicates
198 * if the host has accepted the version sent by the guest.
199 *
200 * is_accepted: If TRUE, host has accepted the version and the guest
201 * should proceed to the next stage of the protocol. FALSE indicates that
202 * guest should re-try with a different version.
203 *
204 * reservedz: Reserved field, set to zero.
205 */
206
207struct dm_version_response {
208 struct dm_header hdr;
209 __u64 is_accepted:1;
210 __u64 reservedz:63;
211} __packed;
212
213/*
214 * Message reporting capabilities. This is sent from the guest to the
215 * host.
216 */
217
218struct dm_capabilities {
219 struct dm_header hdr;
220 union dm_caps caps;
221 __u64 min_page_cnt;
222 __u64 max_page_number;
223} __packed;
224
225/*
226 * Response to the capabilities message. This is sent from the host to the
227 * guest. This message notifies if the host has accepted the guest's
228 * capabilities. If the host has not accepted, the guest must shutdown
229 * the service.
230 *
231 * is_accepted: Indicates if the host has accepted guest's capabilities.
232 * reservedz: Must be 0.
233 */
234
235struct dm_capabilities_resp_msg {
236 struct dm_header hdr;
237 __u64 is_accepted:1;
238 __u64 reservedz:63;
239} __packed;
240
241/*
242 * This message is used to report memory pressure from the guest.
243 * This message is not part of any transaction and there is no
244 * response to this message.
245 *
246 * num_avail: Available memory in pages.
247 * num_committed: Committed memory in pages.
248 * page_file_size: The accumulated size of all page files
249 * in the system in pages.
250 * zero_free: The nunber of zero and free pages.
251 * page_file_writes: The writes to the page file in pages.
252 * io_diff: An indicator of file cache efficiency or page file activity,
253 * calculated as File Cache Page Fault Count - Page Read Count.
254 * This value is in pages.
255 *
256 * Some of these metrics are Windows specific and fortunately
257 * the algorithm on the host side that computes the guest memory
258 * pressure only uses num_committed value.
259 */
260
261struct dm_status {
262 struct dm_header hdr;
263 __u64 num_avail;
264 __u64 num_committed;
265 __u64 page_file_size;
266 __u64 zero_free;
267 __u32 page_file_writes;
268 __u32 io_diff;
269} __packed;
270
271
272/*
273 * Message to ask the guest to allocate memory - balloon up message.
274 * This message is sent from the host to the guest. The guest may not be
275 * able to allocate as much memory as requested.
276 *
277 * num_pages: number of pages to allocate.
278 */
279
280struct dm_balloon {
281 struct dm_header hdr;
282 __u32 num_pages;
283 __u32 reservedz;
284} __packed;
285
286
287/*
288 * Balloon response message; this message is sent from the guest
289 * to the host in response to the balloon message.
290 *
291 * reservedz: Reserved; must be set to zero.
292 * more_pages: If FALSE, this is the last message of the transaction.
293 * if TRUE there will atleast one more message from the guest.
294 *
295 * range_count: The number of ranges in the range array.
296 *
297 * range_array: An array of page ranges returned to the host.
298 *
299 */
300
301struct dm_balloon_response {
302 struct dm_header hdr;
303 __u32 reservedz;
304 __u32 more_pages:1;
305 __u32 range_count:31;
306 union dm_mem_page_range range_array[];
307} __packed;
308
309/*
310 * Un-balloon message; this message is sent from the host
311 * to the guest to give guest more memory.
312 *
313 * more_pages: If FALSE, this is the last message of the transaction.
314 * if TRUE there will atleast one more message from the guest.
315 *
316 * reservedz: Reserved; must be set to zero.
317 *
318 * range_count: The number of ranges in the range array.
319 *
320 * range_array: An array of page ranges returned to the host.
321 *
322 */
323
324struct dm_unballoon_request {
325 struct dm_header hdr;
326 __u32 more_pages:1;
327 __u32 reservedz:31;
328 __u32 range_count;
329 union dm_mem_page_range range_array[];
330} __packed;
331
332/*
333 * Un-balloon response message; this message is sent from the guest
334 * to the host in response to an unballoon request.
335 *
336 */
337
338struct dm_unballoon_response {
339 struct dm_header hdr;
340} __packed;
341
342
343/*
344 * Hot add request message. Message sent from the host to the guest.
345 *
346 * mem_range: Memory range to hot add.
347 *
348 * On Linux we currently don't support this since we cannot hot add
349 * arbitrary granularity of memory.
350 */
351
352struct dm_hot_add {
353 struct dm_header hdr;
354 union dm_mem_page_range range;
355} __packed;
356
357/*
358 * Hot add response message.
359 * This message is sent by the guest to report the status of a hot add request.
360 * If page_count is less than the requested page count, then the host should
361 * assume all further hot add requests will fail, since this indicates that
362 * the guest has hit an upper physical memory barrier.
363 *
364 * Hot adds may also fail due to low resources; in this case, the guest must
365 * not complete this message until the hot add can succeed, and the host must
366 * not send a new hot add request until the response is sent.
367 * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
368 * times it fails the request.
369 *
370 *
371 * page_count: number of pages that were successfully hot added.
372 *
373 * result: result of the operation 1: success, 0: failure.
374 *
375 */
376
377struct dm_hot_add_response {
378 struct dm_header hdr;
379 __u32 page_count;
380 __u32 result;
381} __packed;
382
/*
 * Kinds of information the host can send to the guest.
 * MAX_INFO_TYPE is one past the last defined kind.
 */

enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE
};
391
392
393/*
394 * Header for the information message.
395 */
396
397struct dm_info_header {
398 enum dm_info_type type;
399 __u32 data_size;
400} __packed;
401
402/*
403 * This message is sent from the host to the guest to pass
404 * some relevant information (win8 addition).
405 *
406 * reserved: no used.
407 * info_size: size of the information blob.
408 * info: information blob.
409 */
410
411struct dm_info_msg {
K. Y. Srinivasan6427a0d2012-12-06 11:06:54 -0800412 struct dm_header hdr;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800413 __u32 reserved;
414 __u32 info_size;
415 __u8 info[];
416};
417
418/*
419 * End protocol definitions.
420 */
421
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700422/*
423 * State to manage hot adding memory into the guest.
424 * The range start_pfn : end_pfn specifies the range
425 * that the host has asked us to hot add. The range
426 * start_pfn : ha_end_pfn specifies the range that we have
427 * currently hot added. We hot add in multiples of 128M
428 * chunks; it is possible that we may not be able to bring
429 * online all the pages in the region. The range
430 * covered_start_pfn : covered_end_pfn defines the pages that can
431 * be brough online.
432 */
433
434struct hv_hotadd_state {
435 struct list_head list;
436 unsigned long start_pfn;
437 unsigned long covered_start_pfn;
438 unsigned long covered_end_pfn;
439 unsigned long ha_end_pfn;
440 unsigned long end_pfn;
441};
442
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -0700443struct balloon_state {
444 __u32 num_pages;
445 struct work_struct wrk;
446};
447
K. Y. Srinivasanc51af822013-03-15 12:25:41 -0700448struct hot_add_wrk {
449 union dm_mem_page_range ha_page_range;
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700450 union dm_mem_page_range ha_region_range;
K. Y. Srinivasanc51af822013-03-15 12:25:41 -0700451 struct work_struct wrk;
452};
453
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700454static bool hot_add = true;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800455static bool do_hot_add;
K. Y. Srinivasane500d152013-02-08 15:57:15 -0800456/*
457 * Delay reporting memory pressure by
458 * the specified number of seconds.
459 */
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700460static uint pressure_report_delay = 45;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800461
462module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
463MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
464
K. Y. Srinivasane500d152013-02-08 15:57:15 -0800465module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
466MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800467static atomic_t trans_id = ATOMIC_INIT(0);
468
469static int dm_ring_size = (5 * PAGE_SIZE);
470
/*
 * Driver specific state.
 */

enum hv_dm_state {
	DM_INITIALIZING	= 0,
	DM_INITIALIZED,
	DM_BALLOON_UP,
	DM_BALLOON_DOWN,
	DM_HOT_ADD,
	DM_INIT_ERROR
};
483
484
485static __u8 recv_buffer[PAGE_SIZE];
486static __u8 *send_buffer;
487#define PAGES_IN_2M 512
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700488#define HA_CHUNK (32 * 1024)
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800489
490struct hv_dynmem_device {
491 struct hv_device *dev;
492 enum hv_dm_state state;
493 struct completion host_event;
494 struct completion config_event;
495
496 /*
497 * Number of pages we have currently ballooned out.
498 */
499 unsigned int num_pages_ballooned;
500
501 /*
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -0700502 * State to manage the ballooning (up) operation.
503 */
504 struct balloon_state balloon_wrk;
505
506 /*
K. Y. Srinivasanc51af822013-03-15 12:25:41 -0700507 * State to execute the "hot-add" operation.
508 */
509 struct hot_add_wrk ha_wrk;
510
511 /*
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700512 * This state tracks if the host has specified a hot-add
513 * region.
514 */
515 bool host_specified_ha_region;
516
517 /*
518 * State to synchronize hot-add.
519 */
520 struct completion ol_waitevent;
521 bool ha_waiting;
522 /*
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -0700523 * This thread handles hot-add
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800524 * requests from the host as well as notifying
525 * the host with regards to memory pressure in
526 * the guest.
527 */
528 struct task_struct *thread;
529
530 /*
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700531 * A list of hot-add regions.
532 */
533 struct list_head ha_region_list;
534
535 /*
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800536 * We start with the highest version we can support
537 * and downgrade based on the host; we save here the
538 * next version to try.
539 */
540 __u32 next_version;
541};
542
543static struct hv_dynmem_device dm_device;
544
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700545#ifdef CONFIG_MEMORY_HOTPLUG
546
/*
 * Bring online every page in the pfn range [start_pfn, start_pfn + size).
 *
 * start_pfn: page frame number of the first page.
 * size: number of pages to bring online.
 */
static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size)
{
	/*
	 * Same type as size: a signed int index would be converted for the
	 * comparison and could overflow for very large ranges.
	 */
	unsigned long i;

	for (i = 0; i < size; i++) {
		struct page *pg = pfn_to_page(start_pfn + i);

		__online_page_set_limits(pg);
		__online_page_increment_counters(pg);
		__online_page_free(pg);
	}
}
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800559
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700560static void hv_mem_hot_add(unsigned long start, unsigned long size,
561 unsigned long pfn_count,
562 struct hv_hotadd_state *has)
563{
564 int ret = 0;
565 int i, nid, t;
566 unsigned long start_pfn;
567 unsigned long processed_pfn;
568 unsigned long total_pfn = pfn_count;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800569
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700570 for (i = 0; i < (size/HA_CHUNK); i++) {
571 start_pfn = start + (i * HA_CHUNK);
572 has->ha_end_pfn += HA_CHUNK;
573
574 if (total_pfn > HA_CHUNK) {
575 processed_pfn = HA_CHUNK;
576 total_pfn -= HA_CHUNK;
577 } else {
578 processed_pfn = total_pfn;
579 total_pfn = 0;
580 }
581
582 has->covered_end_pfn += processed_pfn;
583
584 init_completion(&dm_device.ol_waitevent);
585 dm_device.ha_waiting = true;
586
587 nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
588 ret = add_memory(nid, PFN_PHYS((start_pfn)),
589 (HA_CHUNK << PAGE_SHIFT));
590
591 if (ret) {
592 pr_info("hot_add memory failed error is %d\n", ret);
593 has->ha_end_pfn -= HA_CHUNK;
594 has->covered_end_pfn -= processed_pfn;
595 break;
596 }
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800597
598 /*
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700599 * Wait for the memory block to be onlined.
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800600 */
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700601 t = wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ);
602 if (t == 0) {
603 pr_info("hot_add memory timedout\n");
604 has->ha_end_pfn -= HA_CHUNK;
605 has->covered_end_pfn -= processed_pfn;
606 break;
607 }
608
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800609 }
610
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700611 return;
612}
613
614static void hv_online_page(struct page *pg)
615{
616 struct list_head *cur;
617 struct hv_hotadd_state *has;
618 unsigned long cur_start_pgp;
619 unsigned long cur_end_pgp;
620
621 if (dm_device.ha_waiting) {
622 dm_device.ha_waiting = false;
623 complete(&dm_device.ol_waitevent);
624 }
625
626 list_for_each(cur, &dm_device.ha_region_list) {
627 has = list_entry(cur, struct hv_hotadd_state, list);
628 cur_start_pgp = (unsigned long)
629 pfn_to_page(has->covered_start_pfn);
630 cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
631
632 if (((unsigned long)pg >= cur_start_pgp) &&
633 ((unsigned long)pg < cur_end_pgp)) {
634 /*
635 * This frame is currently backed; online the
636 * page.
637 */
638 __online_page_set_limits(pg);
639 __online_page_increment_counters(pg);
640 __online_page_free(pg);
641 has->covered_start_pfn++;
642 }
643 }
644}
645
646static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
647{
648 struct list_head *cur;
649 struct hv_hotadd_state *has;
650 unsigned long residual, new_inc;
651
652 if (list_empty(&dm_device.ha_region_list))
653 return false;
654
655 list_for_each(cur, &dm_device.ha_region_list) {
656 has = list_entry(cur, struct hv_hotadd_state, list);
657
658 /*
659 * If the pfn range we are dealing with is not in the current
660 * "hot add block", move on.
661 */
662 if ((start_pfn >= has->end_pfn))
663 continue;
664 /*
665 * If the current hot add-request extends beyond
666 * our current limit; extend it.
667 */
668 if ((start_pfn + pfn_cnt) > has->end_pfn) {
669 residual = (start_pfn + pfn_cnt - has->end_pfn);
670 /*
671 * Extend the region by multiples of HA_CHUNK.
672 */
673 new_inc = (residual / HA_CHUNK) * HA_CHUNK;
674 if (residual % HA_CHUNK)
675 new_inc += HA_CHUNK;
676
677 has->end_pfn += new_inc;
678 }
679
680 /*
681 * If the current start pfn is not where the covered_end
682 * is, update it.
683 */
684
685 if (has->covered_end_pfn != start_pfn) {
686 has->covered_end_pfn = start_pfn;
687 has->covered_start_pfn = start_pfn;
688 }
689 return true;
690
691 }
692
693 return false;
694}
695
696static unsigned long handle_pg_range(unsigned long pg_start,
697 unsigned long pg_count)
698{
699 unsigned long start_pfn = pg_start;
700 unsigned long pfn_cnt = pg_count;
701 unsigned long size;
702 struct list_head *cur;
703 struct hv_hotadd_state *has;
704 unsigned long pgs_ol = 0;
705 unsigned long old_covered_state;
706
707 if (list_empty(&dm_device.ha_region_list))
708 return 0;
709
710 list_for_each(cur, &dm_device.ha_region_list) {
711 has = list_entry(cur, struct hv_hotadd_state, list);
712
713 /*
714 * If the pfn range we are dealing with is not in the current
715 * "hot add block", move on.
716 */
717 if ((start_pfn >= has->end_pfn))
718 continue;
719
720 old_covered_state = has->covered_end_pfn;
721
722 if (start_pfn < has->ha_end_pfn) {
723 /*
724 * This is the case where we are backing pages
725 * in an already hot added region. Bring
726 * these pages online first.
727 */
728 pgs_ol = has->ha_end_pfn - start_pfn;
729 if (pgs_ol > pfn_cnt)
730 pgs_ol = pfn_cnt;
731 hv_bring_pgs_online(start_pfn, pgs_ol);
732 has->covered_end_pfn += pgs_ol;
733 has->covered_start_pfn += pgs_ol;
734 pfn_cnt -= pgs_ol;
735 }
736
737 if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
738 /*
739 * We have some residual hot add range
740 * that needs to be hot added; hot add
741 * it now. Hot add a multiple of
742 * of HA_CHUNK that fully covers the pages
743 * we have.
744 */
745 size = (has->end_pfn - has->ha_end_pfn);
746 if (pfn_cnt <= size) {
747 size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
748 if (pfn_cnt % HA_CHUNK)
749 size += HA_CHUNK;
750 } else {
751 pfn_cnt = size;
752 }
753 hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
754 }
755 /*
756 * If we managed to online any pages that were given to us,
757 * we declare success.
758 */
759 return has->covered_end_pfn - old_covered_state;
760
761 }
762
763 return 0;
764}
765
766static unsigned long process_hot_add(unsigned long pg_start,
767 unsigned long pfn_cnt,
768 unsigned long rg_start,
769 unsigned long rg_size)
770{
771 struct hv_hotadd_state *ha_region = NULL;
772
773 if (pfn_cnt == 0)
774 return 0;
775
776 if (!dm_device.host_specified_ha_region)
777 if (pfn_covered(pg_start, pfn_cnt))
778 goto do_pg_range;
779
780 /*
781 * If the host has specified a hot-add range; deal with it first.
782 */
783
K. Y. Srinivasan647965a2013-03-29 07:36:11 -0700784 if (rg_size != 0) {
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700785 ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
786 if (!ha_region)
787 return 0;
788
789 INIT_LIST_HEAD(&ha_region->list);
790
791 list_add_tail(&ha_region->list, &dm_device.ha_region_list);
792 ha_region->start_pfn = rg_start;
793 ha_region->ha_end_pfn = rg_start;
794 ha_region->covered_start_pfn = pg_start;
795 ha_region->covered_end_pfn = pg_start;
796 ha_region->end_pfn = rg_start + rg_size;
797 }
798
799do_pg_range:
800 /*
801 * Process the page range specified; bringing them
802 * online if possible.
803 */
804 return handle_pg_range(pg_start, pfn_cnt);
805}
806
807#endif
808
809static void hot_add_req(struct work_struct *dummy)
810{
811 struct dm_hot_add_response resp;
812#ifdef CONFIG_MEMORY_HOTPLUG
813 unsigned long pg_start, pfn_cnt;
814 unsigned long rg_start, rg_sz;
815#endif
816 struct hv_dynmem_device *dm = &dm_device;
817
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800818 memset(&resp, 0, sizeof(struct dm_hot_add_response));
819 resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
820 resp.hdr.size = sizeof(struct dm_hot_add_response);
821 resp.hdr.trans_id = atomic_inc_return(&trans_id);
822
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700823#ifdef CONFIG_MEMORY_HOTPLUG
824 pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
825 pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800826
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -0700827 rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
828 rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
829
830 if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
831 unsigned long region_size;
832 unsigned long region_start;
833
834 /*
835 * The host has not specified the hot-add region.
836 * Based on the hot-add page range being specified,
837 * compute a hot-add region that can cover the pages
838 * that need to be hot-added while ensuring the alignment
839 * and size requirements of Linux as it relates to hot-add.
840 */
841 region_start = pg_start;
842 region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
843 if (pfn_cnt % HA_CHUNK)
844 region_size += HA_CHUNK;
845
846 region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
847
848 rg_start = region_start;
849 rg_sz = region_size;
850 }
851
852 resp.page_count = process_hot_add(pg_start, pfn_cnt,
853 rg_start, rg_sz);
854#endif
855 if (resp.page_count > 0)
856 resp.result = 1;
857 else
858 resp.result = 0;
859
860 if (!do_hot_add || (resp.page_count == 0))
861 pr_info("Memory hot add failed\n");
862
863 dm->state = DM_INITIALIZED;
864 vmbus_sendpacket(dm->dev->channel, &resp,
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800865 sizeof(struct dm_hot_add_response),
866 (unsigned long)NULL,
867 VM_PKT_DATA_INBAND, 0);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800868}
869
870static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
871{
K. Y. Srinivasan6427a0d2012-12-06 11:06:54 -0800872 struct dm_info_header *info_hdr;
873
874 info_hdr = (struct dm_info_header *)msg->info;
875
876 switch (info_hdr->type) {
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800877 case INFO_TYPE_MAX_PAGE_CNT:
878 pr_info("Received INFO_TYPE_MAX_PAGE_CNT\n");
K. Y. Srinivasan6427a0d2012-12-06 11:06:54 -0800879 pr_info("Data Size is %d\n", info_hdr->data_size);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800880 break;
881 default:
K. Y. Srinivasan6427a0d2012-12-06 11:06:54 -0800882 pr_info("Received Unknown type: %d\n", info_hdr->type);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800883 }
884}
885
Wei Yongjuna6025a22013-03-20 23:25:59 +0800886static unsigned long compute_balloon_floor(void)
K. Y. Srinivasan1c7db962013-02-08 15:57:16 -0800887{
888 unsigned long min_pages;
889#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
890 /* Simple continuous piecewiese linear function:
891 * max MiB -> min MiB gradient
892 * 0 0
893 * 16 16
894 * 32 24
895 * 128 72 (1/2)
896 * 512 168 (1/4)
897 * 2048 360 (1/8)
898 * 8192 552 (1/32)
899 * 32768 1320
900 * 131072 4392
901 */
902 if (totalram_pages < MB2PAGES(128))
903 min_pages = MB2PAGES(8) + (totalram_pages >> 1);
904 else if (totalram_pages < MB2PAGES(512))
905 min_pages = MB2PAGES(40) + (totalram_pages >> 2);
906 else if (totalram_pages < MB2PAGES(2048))
907 min_pages = MB2PAGES(104) + (totalram_pages >> 3);
908 else
909 min_pages = MB2PAGES(296) + (totalram_pages >> 5);
910#undef MB2PAGES
911 return min_pages;
912}
913
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800914/*
915 * Post our status as it relates memory pressure to the
916 * host. Host expects the guests to post this status
917 * periodically at 1 second intervals.
918 *
919 * The metrics specified in this protocol are very Windows
920 * specific and so we cook up numbers here to convey our memory
921 * pressure.
922 */
923
924static void post_status(struct hv_dynmem_device *dm)
925{
926 struct dm_status status;
K. Y. Srinivasan07315722013-01-25 16:18:47 -0800927 struct sysinfo val;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800928
K. Y. Srinivasane500d152013-02-08 15:57:15 -0800929 if (pressure_report_delay > 0) {
930 --pressure_report_delay;
931 return;
932 }
K. Y. Srinivasan07315722013-01-25 16:18:47 -0800933 si_meminfo(&val);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800934 memset(&status, 0, sizeof(struct dm_status));
935 status.hdr.type = DM_STATUS_REPORT;
936 status.hdr.size = sizeof(struct dm_status);
937 status.hdr.trans_id = atomic_inc_return(&trans_id);
938
K. Y. Srinivasan07315722013-01-25 16:18:47 -0800939 /*
940 * The host expects the guest to report free memory.
941 * Further, the host expects the pressure information to
942 * include the ballooned out pages.
K. Y. Srinivasan1c7db962013-02-08 15:57:16 -0800943 * For a given amount of memory that we are managing, we
944 * need to compute a floor below which we should not balloon.
945 * Compute this and add it to the pressure report.
K. Y. Srinivasan07315722013-01-25 16:18:47 -0800946 */
947 status.num_avail = val.freeram;
K. Y. Srinivasan1c7db962013-02-08 15:57:16 -0800948 status.num_committed = vm_memory_committed() +
949 dm->num_pages_ballooned +
950 compute_balloon_floor();
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800951
952 vmbus_sendpacket(dm->dev->channel, &status,
953 sizeof(struct dm_status),
954 (unsigned long)NULL,
955 VM_PKT_DATA_INBAND, 0);
956
957}
958
Greg Kroah-Hartman989623c2012-11-21 12:46:40 -0800959static void free_balloon_pages(struct hv_dynmem_device *dm,
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -0800960 union dm_mem_page_range *range_array)
961{
962 int num_pages = range_array->finfo.page_cnt;
963 __u64 start_frame = range_array->finfo.start_page;
964 struct page *pg;
965 int i;
966
967 for (i = 0; i < num_pages; i++) {
968 pg = pfn_to_page(i + start_frame);
969 __free_page(pg);
970 dm->num_pages_ballooned--;
971 }
972}
973
974
975
976static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
977 struct dm_balloon_response *bl_resp, int alloc_unit,
978 bool *alloc_error)
979{
980 int i = 0;
981 struct page *pg;
982
983 if (num_pages < alloc_unit)
984 return 0;
985
986 for (i = 0; (i * alloc_unit) < num_pages; i++) {
987 if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
988 PAGE_SIZE)
989 return i * alloc_unit;
990
991 /*
992 * We execute this code in a thread context. Furthermore,
993 * we don't want the kernel to try too hard.
994 */
995 pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
996 __GFP_NOMEMALLOC | __GFP_NOWARN,
997 get_order(alloc_unit << PAGE_SHIFT));
998
999 if (!pg) {
1000 *alloc_error = true;
1001 return i * alloc_unit;
1002 }
1003
1004
1005 dm->num_pages_ballooned += alloc_unit;
1006
K. Y. Srinivasanf766dc12013-03-18 13:51:37 -07001007 /*
1008 * If we allocatted 2M pages; split them so we
1009 * can free them in any order we get.
1010 */
1011
1012 if (alloc_unit != 1)
1013 split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
1014
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001015 bl_resp->range_count++;
1016 bl_resp->range_array[i].finfo.start_page =
1017 page_to_pfn(pg);
1018 bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
1019 bl_resp->hdr.size += sizeof(union dm_mem_page_range);
1020
1021 }
1022
1023 return num_pages;
1024}
1025
1026
1027
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001028static void balloon_up(struct work_struct *dummy)
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001029{
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001030 int num_pages = dm_device.balloon_wrk.num_pages;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001031 int num_ballooned = 0;
1032 struct dm_balloon_response *bl_resp;
1033 int alloc_unit;
1034 int ret;
1035 bool alloc_error = false;
1036 bool done = false;
1037 int i;
1038
1039
1040 /*
K. Y. Srinivasanf766dc12013-03-18 13:51:37 -07001041 * We will attempt 2M allocations. However, if we fail to
1042 * allocate 2M chunks, we will go back to 4k allocations.
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001043 */
K. Y. Srinivasanf766dc12013-03-18 13:51:37 -07001044 alloc_unit = 512;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001045
1046 while (!done) {
1047 bl_resp = (struct dm_balloon_response *)send_buffer;
1048 memset(send_buffer, 0, PAGE_SIZE);
1049 bl_resp->hdr.type = DM_BALLOON_RESPONSE;
1050 bl_resp->hdr.trans_id = atomic_inc_return(&trans_id);
1051 bl_resp->hdr.size = sizeof(struct dm_balloon_response);
1052 bl_resp->more_pages = 1;
1053
1054
1055 num_pages -= num_ballooned;
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001056 num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001057 bl_resp, alloc_unit,
1058 &alloc_error);
1059
K. Y. Srinivasanf766dc12013-03-18 13:51:37 -07001060 if ((alloc_error) && (alloc_unit != 1)) {
1061 alloc_unit = 1;
1062 continue;
1063 }
1064
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001065 if ((alloc_error) || (num_ballooned == num_pages)) {
1066 bl_resp->more_pages = 0;
1067 done = true;
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001068 dm_device.state = DM_INITIALIZED;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001069 }
1070
1071 /*
1072 * We are pushing a lot of data through the channel;
1073 * deal with transient failures caused because of the
1074 * lack of space in the ring buffer.
1075 */
1076
1077 do {
1078 ret = vmbus_sendpacket(dm_device.dev->channel,
1079 bl_resp,
1080 bl_resp->hdr.size,
1081 (unsigned long)NULL,
1082 VM_PKT_DATA_INBAND, 0);
1083
1084 if (ret == -EAGAIN)
1085 msleep(20);
1086
1087 } while (ret == -EAGAIN);
1088
1089 if (ret) {
1090 /*
1091 * Free up the memory we allocatted.
1092 */
1093 pr_info("Balloon response failed\n");
1094
1095 for (i = 0; i < bl_resp->range_count; i++)
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001096 free_balloon_pages(&dm_device,
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001097 &bl_resp->range_array[i]);
1098
1099 done = true;
1100 }
1101 }
1102
1103}
1104
1105static void balloon_down(struct hv_dynmem_device *dm,
1106 struct dm_unballoon_request *req)
1107{
1108 union dm_mem_page_range *range_array = req->range_array;
1109 int range_count = req->range_count;
1110 struct dm_unballoon_response resp;
1111 int i;
1112
1113 for (i = 0; i < range_count; i++)
1114 free_balloon_pages(dm, &range_array[i]);
1115
1116 if (req->more_pages == 1)
1117 return;
1118
1119 memset(&resp, 0, sizeof(struct dm_unballoon_response));
1120 resp.hdr.type = DM_UNBALLOON_RESPONSE;
1121 resp.hdr.trans_id = atomic_inc_return(&trans_id);
1122 resp.hdr.size = sizeof(struct dm_unballoon_response);
1123
1124 vmbus_sendpacket(dm_device.dev->channel, &resp,
1125 sizeof(struct dm_unballoon_response),
1126 (unsigned long)NULL,
1127 VM_PKT_DATA_INBAND, 0);
1128
1129 dm->state = DM_INITIALIZED;
1130}
1131
1132static void balloon_onchannelcallback(void *context);
1133
1134static int dm_thread_func(void *dm_dev)
1135{
1136 struct hv_dynmem_device *dm = dm_dev;
1137 int t;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001138
1139 while (!kthread_should_stop()) {
1140 t = wait_for_completion_timeout(&dm_device.config_event, 1*HZ);
1141 /*
1142 * The host expects us to post information on the memory
1143 * pressure every second.
1144 */
1145
1146 if (t == 0)
1147 post_status(dm);
1148
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001149 }
1150
1151 return 0;
1152}
1153
1154
1155static void version_resp(struct hv_dynmem_device *dm,
1156 struct dm_version_response *vresp)
1157{
1158 struct dm_version_request version_req;
1159 int ret;
1160
1161 if (vresp->is_accepted) {
1162 /*
1163 * We are done; wakeup the
1164 * context waiting for version
1165 * negotiation.
1166 */
1167 complete(&dm->host_event);
1168 return;
1169 }
1170 /*
1171 * If there are more versions to try, continue
1172 * with negotiations; if not
1173 * shutdown the service since we are not able
1174 * to negotiate a suitable version number
1175 * with the host.
1176 */
1177 if (dm->next_version == 0)
1178 goto version_error;
1179
1180 dm->next_version = 0;
1181 memset(&version_req, 0, sizeof(struct dm_version_request));
1182 version_req.hdr.type = DM_VERSION_REQUEST;
1183 version_req.hdr.size = sizeof(struct dm_version_request);
1184 version_req.hdr.trans_id = atomic_inc_return(&trans_id);
1185 version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN7;
1186 version_req.is_last_attempt = 1;
1187
1188 ret = vmbus_sendpacket(dm->dev->channel, &version_req,
1189 sizeof(struct dm_version_request),
1190 (unsigned long)NULL,
1191 VM_PKT_DATA_INBAND, 0);
1192
1193 if (ret)
1194 goto version_error;
1195
1196 return;
1197
1198version_error:
1199 dm->state = DM_INIT_ERROR;
1200 complete(&dm->host_event);
1201}
1202
1203static void cap_resp(struct hv_dynmem_device *dm,
1204 struct dm_capabilities_resp_msg *cap_resp)
1205{
1206 if (!cap_resp->is_accepted) {
1207 pr_info("Capabilities not accepted by host\n");
1208 dm->state = DM_INIT_ERROR;
1209 }
1210 complete(&dm->host_event);
1211}
1212
1213static void balloon_onchannelcallback(void *context)
1214{
1215 struct hv_device *dev = context;
1216 u32 recvlen;
1217 u64 requestid;
1218 struct dm_message *dm_msg;
1219 struct dm_header *dm_hdr;
1220 struct hv_dynmem_device *dm = hv_get_drvdata(dev);
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001221 struct dm_balloon *bal_msg;
K. Y. Srinivasanc51af822013-03-15 12:25:41 -07001222 struct dm_hot_add *ha_msg;
1223 union dm_mem_page_range *ha_pg_range;
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001224 union dm_mem_page_range *ha_region;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001225
1226 memset(recv_buffer, 0, sizeof(recv_buffer));
1227 vmbus_recvpacket(dev->channel, recv_buffer,
1228 PAGE_SIZE, &recvlen, &requestid);
1229
1230 if (recvlen > 0) {
1231 dm_msg = (struct dm_message *)recv_buffer;
1232 dm_hdr = &dm_msg->hdr;
1233
1234 switch (dm_hdr->type) {
1235 case DM_VERSION_RESPONSE:
1236 version_resp(dm,
1237 (struct dm_version_response *)dm_msg);
1238 break;
1239
1240 case DM_CAPABILITIES_RESPONSE:
1241 cap_resp(dm,
1242 (struct dm_capabilities_resp_msg *)dm_msg);
1243 break;
1244
1245 case DM_BALLOON_REQUEST:
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001246 if (dm->state == DM_BALLOON_UP)
1247 pr_warn("Currently ballooning\n");
1248 bal_msg = (struct dm_balloon *)recv_buffer;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001249 dm->state = DM_BALLOON_UP;
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001250 dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
1251 schedule_work(&dm_device.balloon_wrk.wrk);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001252 break;
1253
1254 case DM_UNBALLOON_REQUEST:
1255 dm->state = DM_BALLOON_DOWN;
1256 balloon_down(dm,
1257 (struct dm_unballoon_request *)recv_buffer);
1258 break;
1259
1260 case DM_MEM_HOT_ADD_REQUEST:
K. Y. Srinivasanc51af822013-03-15 12:25:41 -07001261 if (dm->state == DM_HOT_ADD)
1262 pr_warn("Currently hot-adding\n");
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001263 dm->state = DM_HOT_ADD;
K. Y. Srinivasanc51af822013-03-15 12:25:41 -07001264 ha_msg = (struct dm_hot_add *)recv_buffer;
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001265 if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
1266 /*
1267 * This is a normal hot-add request specifying
1268 * hot-add memory.
1269 */
1270 ha_pg_range = &ha_msg->range;
1271 dm->ha_wrk.ha_page_range = *ha_pg_range;
1272 dm->ha_wrk.ha_region_range.page_range = 0;
1273 } else {
1274 /*
1275 * Host is specifying that we first hot-add
1276 * a region and then partially populate this
1277 * region.
1278 */
1279 dm->host_specified_ha_region = true;
1280 ha_pg_range = &ha_msg->range;
1281 ha_region = &ha_pg_range[1];
1282 dm->ha_wrk.ha_page_range = *ha_pg_range;
1283 dm->ha_wrk.ha_region_range = *ha_region;
1284 }
K. Y. Srinivasanc51af822013-03-15 12:25:41 -07001285 schedule_work(&dm_device.ha_wrk.wrk);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001286 break;
1287
1288 case DM_INFO_MESSAGE:
1289 process_info(dm, (struct dm_info_msg *)dm_msg);
1290 break;
1291
1292 default:
1293 pr_err("Unhandled message: type: %d\n", dm_hdr->type);
1294
1295 }
1296 }
1297
1298}
1299
1300static int balloon_probe(struct hv_device *dev,
1301 const struct hv_vmbus_device_id *dev_id)
1302{
1303 int ret, t;
1304 struct dm_version_request version_req;
1305 struct dm_capabilities cap_msg;
1306
1307 do_hot_add = hot_add;
1308
1309 /*
1310 * First allocate a send buffer.
1311 */
1312
1313 send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
1314 if (!send_buffer)
1315 return -ENOMEM;
1316
1317 ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
1318 balloon_onchannelcallback, dev);
1319
1320 if (ret)
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001321 goto probe_error0;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001322
1323 dm_device.dev = dev;
1324 dm_device.state = DM_INITIALIZING;
1325 dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
1326 init_completion(&dm_device.host_event);
1327 init_completion(&dm_device.config_event);
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001328 INIT_LIST_HEAD(&dm_device.ha_region_list);
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001329 INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
K. Y. Srinivasanc51af822013-03-15 12:25:41 -07001330 INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001331 dm_device.host_specified_ha_region = false;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001332
1333 dm_device.thread =
1334 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
1335 if (IS_ERR(dm_device.thread)) {
1336 ret = PTR_ERR(dm_device.thread);
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001337 goto probe_error1;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001338 }
1339
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001340#ifdef CONFIG_MEMORY_HOTPLUG
1341 set_online_page_callback(&hv_online_page);
1342#endif
1343
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001344 hv_set_drvdata(dev, &dm_device);
1345 /*
1346 * Initiate the hand shake with the host and negotiate
1347 * a version that the host can support. We start with the
1348 * highest version number and go down if the host cannot
1349 * support it.
1350 */
1351 memset(&version_req, 0, sizeof(struct dm_version_request));
1352 version_req.hdr.type = DM_VERSION_REQUEST;
1353 version_req.hdr.size = sizeof(struct dm_version_request);
1354 version_req.hdr.trans_id = atomic_inc_return(&trans_id);
1355 version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN8;
1356 version_req.is_last_attempt = 0;
1357
1358 ret = vmbus_sendpacket(dev->channel, &version_req,
1359 sizeof(struct dm_version_request),
1360 (unsigned long)NULL,
K. Y. Srinivasan7a64b862013-03-15 12:25:39 -07001361 VM_PKT_DATA_INBAND, 0);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001362 if (ret)
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001363 goto probe_error2;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001364
1365 t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
1366 if (t == 0) {
1367 ret = -ETIMEDOUT;
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001368 goto probe_error2;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001369 }
1370
1371 /*
1372 * If we could not negotiate a compatible version with the host
1373 * fail the probe function.
1374 */
1375 if (dm_device.state == DM_INIT_ERROR) {
1376 ret = -ETIMEDOUT;
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001377 goto probe_error2;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001378 }
1379 /*
1380 * Now submit our capabilities to the host.
1381 */
1382 memset(&cap_msg, 0, sizeof(struct dm_capabilities));
1383 cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
1384 cap_msg.hdr.size = sizeof(struct dm_capabilities);
1385 cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
1386
1387 cap_msg.caps.cap_bits.balloon = 1;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001388 cap_msg.caps.cap_bits.hot_add = 1;
1389
1390 /*
K. Y. Srinivasan647965a2013-03-29 07:36:11 -07001391 * Specify our alignment requirements as it relates
1392 * memory hot-add. Specify 128MB alignment.
1393 */
1394 cap_msg.caps.cap_bits.hot_add_alignment = 7;
1395
1396 /*
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001397 * Currently the host does not use these
1398 * values and we set them to what is done in the
1399 * Windows driver.
1400 */
1401 cap_msg.min_page_cnt = 0;
1402 cap_msg.max_page_number = -1;
1403
1404 ret = vmbus_sendpacket(dev->channel, &cap_msg,
1405 sizeof(struct dm_capabilities),
1406 (unsigned long)NULL,
K. Y. Srinivasan7a64b862013-03-15 12:25:39 -07001407 VM_PKT_DATA_INBAND, 0);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001408 if (ret)
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001409 goto probe_error2;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001410
1411 t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
1412 if (t == 0) {
1413 ret = -ETIMEDOUT;
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001414 goto probe_error2;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001415 }
1416
1417 /*
1418 * If the host does not like our capabilities,
1419 * fail the probe function.
1420 */
1421 if (dm_device.state == DM_INIT_ERROR) {
1422 ret = -ETIMEDOUT;
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001423 goto probe_error2;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001424 }
1425
1426 dm_device.state = DM_INITIALIZED;
1427
1428 return 0;
1429
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001430probe_error2:
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001431#ifdef CONFIG_MEMORY_HOTPLUG
1432 restore_online_page_callback(&hv_online_page);
1433#endif
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001434 kthread_stop(dm_device.thread);
1435
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001436probe_error1:
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001437 vmbus_close(dev->channel);
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001438probe_error0:
1439 kfree(send_buffer);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001440 return ret;
1441}
1442
1443static int balloon_remove(struct hv_device *dev)
1444{
1445 struct hv_dynmem_device *dm = hv_get_drvdata(dev);
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001446 struct list_head *cur, *tmp;
1447 struct hv_hotadd_state *has;
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001448
1449 if (dm->num_pages_ballooned != 0)
1450 pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
1451
K. Y. Srinivasan6571b2d2013-03-15 12:25:40 -07001452 cancel_work_sync(&dm->balloon_wrk.wrk);
K. Y. Srinivasanc51af822013-03-15 12:25:41 -07001453 cancel_work_sync(&dm->ha_wrk.wrk);
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001454
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001455 vmbus_close(dev->channel);
1456 kthread_stop(dm->thread);
K. Y. Srinivasan33080c12012-12-11 11:07:17 -08001457 kfree(send_buffer);
K. Y. Srinivasan1cac8cd2013-03-15 12:25:43 -07001458#ifdef CONFIG_MEMORY_HOTPLUG
1459 restore_online_page_callback(&hv_online_page);
1460#endif
1461 list_for_each_safe(cur, tmp, &dm->ha_region_list) {
1462 has = list_entry(cur, struct hv_hotadd_state, list);
1463 list_del(&has->list);
1464 kfree(has);
1465 }
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001466
1467 return 0;
1468}
1469
1470static const struct hv_vmbus_device_id id_table[] = {
1471 /* Dynamic Memory Class ID */
1472 /* 525074DC-8985-46e2-8057-A307DC18A502 */
K. Y. Srinivasand13984e2013-01-23 17:42:41 -08001473 { HV_DM_GUID, },
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001474 { },
1475};
1476
1477MODULE_DEVICE_TABLE(vmbus, id_table);
1478
1479static struct hv_driver balloon_drv = {
1480 .name = "hv_balloon",
1481 .id_table = id_table,
1482 .probe = balloon_probe,
1483 .remove = balloon_remove,
1484};
1485
1486static int __init init_balloon_drv(void)
1487{
1488
1489 return vmbus_driver_register(&balloon_drv);
1490}
1491
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001492module_init(init_balloon_drv);
K. Y. Srinivasan9aa8b502012-11-14 01:09:02 -08001493
1494MODULE_DESCRIPTION("Hyper-V Balloon");
1495MODULE_VERSION(HV_DRV_VERSION);
1496MODULE_LICENSE("GPL");