py: Move locals/globals dicts to the thread-specific state.

Each thread needs to have its own private references to its current
locals/globals dicts, otherwise functions running within different
contexts (e.g. imported from different files) can behave very strangely.
diff --git a/py/modthread.c b/py/modthread.c
index 975e7d1..dda9e93 100644
--- a/py/modthread.c
+++ b/py/modthread.c
@@ -143,6 +143,8 @@
 STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_thread_stack_size_obj, 0, 1, mod_thread_stack_size);
 
 typedef struct _thread_entry_args_t {
+    mp_obj_dict_t *dict_locals;
+    mp_obj_dict_t *dict_globals;
     size_t stack_size;
     mp_obj_t fun;
     size_t n_args;
@@ -161,6 +163,10 @@
     mp_stack_set_top(&ts + 1); // need to include ts in root-pointer scan
     mp_stack_set_limit(args->stack_size);
 
+    // set locals and globals from the calling context
+    mp_locals_set(args->dict_locals);
+    mp_globals_set(args->dict_globals);
+
     MP_THREAD_GIL_ENTER();
 
     // signal that we are set up and running
@@ -169,7 +175,6 @@
     // TODO set more thread-specific state here:
     //  mp_pending_exception? (root pointer)
     //  cur_exception (root pointer)
-    //  dict_locals? (root pointer) uPy doesn't make a new locals dict for functions, just for classes, so it's different to CPy
 
     DEBUG_printf("[thread] start ts=%p args=%p stack=%p\n", &ts, &args, MP_STATE_THREAD(stack_top));
 
@@ -240,6 +245,10 @@
     th_args->n_args = pos_args_len;
     memcpy(th_args->args, pos_args_items, pos_args_len * sizeof(mp_obj_t));
 
+    // pass our locals and globals into the new thread
+    th_args->dict_locals = mp_locals_get();
+    th_args->dict_globals = mp_globals_get();
+
     // set the stack size to use
     th_args->stack_size = thread_stack_size;
 
diff --git a/py/mpstate.h b/py/mpstate.h
index 54392a9..9e4f54a 100644
--- a/py/mpstate.h
+++ b/py/mpstate.h
@@ -196,11 +196,13 @@
 // This structure holds state that is specific to a given thread.
 // Everything in this structure is scanned for root pointers.
 typedef struct _mp_state_thread_t {
+    mp_obj_dict_t *dict_locals;
+    mp_obj_dict_t *dict_globals;
+
     // Note: nlr asm code has the offset of this hard-coded
     nlr_buf_t *nlr_top; // ROOT POINTER
 
     // Stack top at the start of program
-    // Note: this entry is used to locate the end of the root pointer section.
     char *stack_top;
 
     #if MICROPY_STACK_CHECK
@@ -208,15 +210,11 @@
     #endif
 } mp_state_thread_t;
 
-// This structure combines the above 3 structures, and adds the local
-// and global dicts.
+// This structure combines the above 3 structures.
+// The order of the entries is important for root pointer scanning in the GC to work.
 // Note: if this structure changes then revisit all nlr asm code since they
 // have the offset of nlr_top hard-coded.
 typedef struct _mp_state_ctx_t {
-    // these must come first for root pointer scanning in GC to work
-    mp_obj_dict_t *dict_locals;
-    mp_obj_dict_t *dict_globals;
-    // these must come next in this order for root pointer scanning in GC to work
     mp_state_thread_t thread;
     mp_state_vm_t vm;
     mp_state_mem_t mem;
@@ -224,7 +222,7 @@
 
 extern mp_state_ctx_t mp_state_ctx;
 
-#define MP_STATE_CTX(x) (mp_state_ctx.x)
+#define MP_STATE_CTX(x) MP_STATE_THREAD(x)
 #define MP_STATE_VM(x) (mp_state_ctx.vm.x)
 #define MP_STATE_MEM(x) (mp_state_ctx.mem.x)
 
diff --git a/py/nlrx64.S b/py/nlrx64.S
index 78d6545..610456a 100644
--- a/py/nlrx64.S
+++ b/py/nlrx64.S
@@ -44,7 +44,7 @@
 #endif
 
 // offset of nlr_top within mp_state_thread_t structure
-#define NLR_TOP_TH_OFF (0)
+#define NLR_TOP_TH_OFF (2 * 8)
 
 #if defined(_WIN32) || defined(__CYGWIN__)
 #define NLR_OS_WINDOWS
diff --git a/py/nlrx86.S b/py/nlrx86.S
index ff0cc21..16d6f57 100644
--- a/py/nlrx86.S
+++ b/py/nlrx86.S
@@ -52,7 +52,7 @@
 #endif
 
 // offset of nlr_top within mp_state_thread_t structure
-#define NLR_TOP_TH_OFF (0)
+#define NLR_TOP_TH_OFF (2 * 4)
 
     .file   "nlr.s"
     .text