stmhal: Initialise stack pointer correctly.

Stack is full descending and must be 8-byte aligned.  It must start off
pointing to just above the last byte of RAM.

Previously, the stack started off pointing to the last byte of RAM
(eg 0x2001ffff) and so was not 8-byte aligned.  This caused a bug in
combination with alloca.

This patch also updates some debug printing code.

Addresses issue #872 (among many other undiscovered issues).
diff --git a/py/builtinimport.c b/py/builtinimport.c
index ec29f52..9bcc074 100644
--- a/py/builtinimport.c
+++ b/py/builtinimport.c
@@ -167,11 +167,11 @@
 
 mp_obj_t mp_builtin___import__(mp_uint_t n_args, mp_obj_t *args) {
 #if DEBUG_PRINT
-    printf("__import__:\n");
+    DEBUG_printf("__import__:\n");
     for (int i = 0; i < n_args; i++) {
-        printf("  ");
+        DEBUG_printf("  ");
         mp_obj_print(args[i], PRINT_REPR);
-        printf("\n");
+        DEBUG_printf("\n");
     }
 #endif
 
@@ -199,9 +199,9 @@
         mp_obj_t this_name_q = mp_obj_dict_get(mp_globals_get(), MP_OBJ_NEW_QSTR(MP_QSTR___name__));
         assert(this_name_q != MP_OBJ_NULL);
 #if DEBUG_PRINT
-        printf("Current module: ");
+        DEBUG_printf("Current module: ");
         mp_obj_print(this_name_q, PRINT_REPR);
-        printf("\n");
+        DEBUG_printf("\n");
 #endif
 
         mp_uint_t this_name_l;
diff --git a/py/objfun.c b/py/objfun.c
index 8c08ce7..eb89a78 100644
--- a/py/objfun.c
+++ b/py/objfun.c
@@ -183,7 +183,7 @@
 STATIC mp_obj_t fun_bc_call(mp_obj_t self_in, mp_uint_t n_args, mp_uint_t n_kw, const mp_obj_t *args) {
     MP_STACK_CHECK();
 
-    DEBUG_printf("Input n_args: %d, n_kw: %d\n", n_args, n_kw);
+    DEBUG_printf("Input n_args: " UINT_FMT ", n_kw: " UINT_FMT "\n", n_args, n_kw);
     DEBUG_printf("Input pos args: ");
     dump_args(args, n_args);
     DEBUG_printf("Input kw args: ");
diff --git a/py/runtime.c b/py/runtime.c
index e84db4e..0db8093 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -190,7 +190,7 @@
 }
 
 mp_obj_t mp_unary_op(mp_uint_t op, mp_obj_t arg) {
-    DEBUG_OP_printf("unary %d %p\n", op, arg);
+    DEBUG_OP_printf("unary " UINT_FMT " %p\n", op, arg);
 
     if (MP_OBJ_IS_SMALL_INT(arg)) {
         mp_int_t val = MP_OBJ_SMALL_INT_VALUE(arg);
@@ -226,7 +226,7 @@
 }
 
 mp_obj_t mp_binary_op(mp_uint_t op, mp_obj_t lhs, mp_obj_t rhs) {
-    DEBUG_OP_printf("binary %d %p %p\n", op, lhs, rhs);
+    DEBUG_OP_printf("binary " UINT_FMT " %p %p\n", op, lhs, rhs);
 
     // TODO correctly distinguish inplace operators for mutable objects
     // lookup logic that CPython uses for +=:
@@ -523,7 +523,7 @@
     // TODO improve this: fun object can specify its type and we parse here the arguments,
     // passing to the function arrays of fixed and keyword arguments
 
-    DEBUG_OP_printf("calling function %p(n_args=%d, n_kw=%d, args=%p)\n", fun_in, n_args, n_kw, args);
+    DEBUG_OP_printf("calling function %p(n_args=" UINT_FMT ", n_kw=" UINT_FMT ", args=%p)\n", fun_in, n_args, n_kw, args);
 
     // get the type
     mp_obj_type_t *type = mp_obj_get_type(fun_in);
@@ -539,7 +539,7 @@
 // args contains: fun  self/NULL  arg(0)  ...  arg(n_args-2)  arg(n_args-1)  kw_key(0)  kw_val(0)  ... kw_key(n_kw-1)  kw_val(n_kw-1)
 // if n_args==0 and n_kw==0 then there are only fun and self/NULL
 mp_obj_t mp_call_method_n_kw(mp_uint_t n_args, mp_uint_t n_kw, const mp_obj_t *args) {
-    DEBUG_OP_printf("call method (fun=%p, self=%p, n_args=%u, n_kw=%u, args=%p)\n", args[0], args[1], n_args, n_kw, args);
+    DEBUG_OP_printf("call method (fun=%p, self=%p, n_args=" UINT_FMT ", n_kw=" UINT_FMT ", args=%p)\n", args[0], args[1], n_args, n_kw, args);
     int adjust = (args[1] == NULL) ? 0 : 1;
     return mp_call_function_n_kw(args[0], n_args + adjust, n_kw, args + 2 - adjust);
 }
@@ -732,7 +732,7 @@
 void mp_unpack_ex(mp_obj_t seq_in, mp_uint_t num_in, mp_obj_t *items) {
     mp_uint_t num_left = num_in & 0xff;
     mp_uint_t num_right = (num_in >> 8) & 0xff;
-    DEBUG_OP_printf("unpack ex %d %d\n", num_left, num_right);
+    DEBUG_OP_printf("unpack ex " UINT_FMT " " UINT_FMT "\n", num_left, num_right);
     mp_uint_t seq_len;
     if (MP_OBJ_IS_TYPE(seq_in, &mp_type_tuple) || MP_OBJ_IS_TYPE(seq_in, &mp_type_list)) {
         mp_obj_t *seq_items;
diff --git a/stmhal/main.c b/stmhal/main.c
index 6aa7b08..05b097b 100644
--- a/stmhal/main.c
+++ b/stmhal/main.c
@@ -174,9 +174,9 @@
 int main(void) {
     // TODO disable JTAG
 
-    // Stack limit should be less than real stack size, so we
-    // had chance to recover from limit hit.
-    mp_stack_set_limit(&_ram_end - &_heap_end - 512);
+    // Stack limit should be less than the real stack size, so we have a
+    // chance to recover when the limit is hit.  (Limit is measured in bytes.)
+    mp_stack_set_limit((char*)&_ram_end - (char*)&_heap_end - 1024);
 
     /* STM32F4xx HAL library initialization:
          - Configure the Flash prefetch, instruction and Data caches
diff --git a/stmhal/stm32f405.ld b/stmhal/stm32f405.ld
index 9d9d4d5..5af1d32 100644
--- a/stmhal/stm32f405.ld
+++ b/stmhal/stm32f405.ld
@@ -16,10 +16,10 @@
 _minimum_stack_size = 2K;
 _minimum_heap_size = 16K;
  
-/* top end of the stack */
-
-/*_stack_end = ORIGIN(RAM) + LENGTH(RAM);*/
-_estack = ORIGIN(RAM) + LENGTH(RAM) - 1;
+/* Define the top end of the stack.  The stack is full descending so begins just
+   above the last byte of RAM.  Note that EABI requires the stack to be 8-byte
+   aligned for a call. */
+_estack = ORIGIN(RAM) + LENGTH(RAM);
 
 /* RAM extents for the garbage collector */
 _ram_end = ORIGIN(RAM) + LENGTH(RAM);