py: Move locals/globals dicts to the thread-specific state.

Each threads needs to have its own private references to its current
locals/globals dicts, otherwise functions running within different
contexts (eg imported from different files) can behave very strangely.
diff --git a/py/modthread.c b/py/modthread.c
index 975e7d1..dda9e93 100644
--- a/py/modthread.c
+++ b/py/modthread.c
@@ -143,6 +143,8 @@
 STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_thread_stack_size_obj, 0, 1, mod_thread_stack_size);
 
 typedef struct _thread_entry_args_t {
+    mp_obj_dict_t *dict_locals;
+    mp_obj_dict_t *dict_globals;
     size_t stack_size;
     mp_obj_t fun;
     size_t n_args;
@@ -161,6 +163,10 @@
     mp_stack_set_top(&ts + 1); // need to include ts in root-pointer scan
     mp_stack_set_limit(args->stack_size);
 
+    // set locals and globals from the calling context
+    mp_locals_set(args->dict_locals);
+    mp_globals_set(args->dict_globals);
+
     MP_THREAD_GIL_ENTER();
 
     // signal that we are set up and running
@@ -169,7 +175,6 @@
     // TODO set more thread-specific state here:
     //  mp_pending_exception? (root pointer)
     //  cur_exception (root pointer)
-    //  dict_locals? (root pointer) uPy doesn't make a new locals dict for functions, just for classes, so it's different to CPy
 
     DEBUG_printf("[thread] start ts=%p args=%p stack=%p\n", &ts, &args, MP_STATE_THREAD(stack_top));
 
@@ -240,6 +245,10 @@
     th_args->n_args = pos_args_len;
     memcpy(th_args->args, pos_args_items, pos_args_len * sizeof(mp_obj_t));
 
+    // pass our locals and globals into the new thread
+    th_args->dict_locals = mp_locals_get();
+    th_args->dict_globals = mp_globals_get();
+
     // set the stack size to use
     th_args->stack_size = thread_stack_size;