py: Added optimised support for 3-argument calls to builtins.pow()

Updated modbuiltins.c to add conditional support for 3-arg calls to
pow() using the MICROPY_PY_BUILTINS_POW3 config parameter. Added support
in objint_mpz.c for an optimised implementation.
diff --git a/py/modbuiltins.c b/py/modbuiltins.c
index f62afd8..a0c6893 100644
--- a/py/modbuiltins.c
+++ b/py/modbuiltins.c
@@ -378,7 +378,14 @@
 STATIC mp_obj_t mp_builtin_pow(size_t n_args, const mp_obj_t *args) {
     switch (n_args) {
         case 2: return mp_binary_op(MP_BINARY_OP_POWER, args[0], args[1]);
-        default: return mp_binary_op(MP_BINARY_OP_MODULO, mp_binary_op(MP_BINARY_OP_POWER, args[0], args[1]), args[2]); // TODO optimise...
+        default: // 3 arguments: pow(base, exponent, modulus)
+#if !MICROPY_PY_BUILTINS_POW3
+            mp_raise_msg(&mp_type_NotImplementedError, "3-arg pow() not supported"); // feature compiled out
+#elif MICROPY_LONGINT_IMPL != MICROPY_LONGINT_IMPL_MPZ
+            return mp_binary_op(MP_BINARY_OP_MODULO, mp_binary_op(MP_BINARY_OP_POWER, args[0], args[1]), args[2]); // unoptimised: full power is computed, then reduced
+#else
+            return mp_obj_int_pow3(args[0], args[1], args[2]); // dedicated mpz implementation
+#endif
     }
 }
 MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mp_builtin_pow_obj, 2, 3, mp_builtin_pow);
diff --git a/py/mpconfig.h b/py/mpconfig.h
index 993ad1d..13af4c6 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -490,6 +490,11 @@
 #define MICROPY_LONGINT_IMPL (MICROPY_LONGINT_IMPL_NONE)
 #endif
 
+// Whether to support pow() with 3 integer arguments; if disabled such calls raise NotImplementedError
+#ifndef MICROPY_PY_BUILTINS_POW3
+#define MICROPY_PY_BUILTINS_POW3 (0)
+#endif
+
 #if MICROPY_LONGINT_IMPL == MICROPY_LONGINT_IMPL_LONGLONG
 typedef long long mp_longint_impl_t;
 #endif
diff --git a/py/mpz.c b/py/mpz.c
index 6477c3f..230eb92 100644
--- a/py/mpz.c
+++ b/py/mpz.c
@@ -1395,9 +1395,6 @@
     mpz_free(n);
 }
 
-#if 0
-these functions are unused
-
 /* computes dest = (lhs ** rhs) % mod
    can have dest, lhs, rhs the same; mod can't be the same as dest
 */
@@ -1436,6 +1433,9 @@
     mpz_free(n);
 }
 
+#if 0
+these functions are unused
+
 /* computes gcd(z1, z2)
    based on Knuth's modified gcd algorithm (I think?)
    gcd(z1, z2) >= 0
diff --git a/py/mpz.h b/py/mpz.h
index a26cbea..8facb1a 100644
--- a/py/mpz.h
+++ b/py/mpz.h
@@ -123,6 +123,7 @@
 void mpz_sub_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs);
 void mpz_mul_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs);
 void mpz_pow_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs);
+void mpz_pow3_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs, const mpz_t *mod);
 void mpz_and_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs);
 void mpz_or_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs);
 void mpz_xor_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs);
diff --git a/py/objint.h b/py/objint.h
index a84a33f..7205761 100644
--- a/py/objint.h
+++ b/py/objint.h
@@ -66,5 +66,6 @@
 mp_obj_t mp_obj_int_unary_op(mp_uint_t op, mp_obj_t o_in);
 mp_obj_t mp_obj_int_binary_op(mp_uint_t op, mp_obj_t lhs_in, mp_obj_t rhs_in);
 mp_obj_t mp_obj_int_binary_op_extra_cases(mp_uint_t op, mp_obj_t lhs_in, mp_obj_t rhs_in);
+mp_obj_t mp_obj_int_pow3(mp_obj_t base, mp_obj_t exponent,  mp_obj_t modulus);
 
 #endif // __MICROPY_INCLUDED_PY_OBJINT_H__
diff --git a/py/objint_mpz.c b/py/objint_mpz.c
index d465ef9..2b27df4 100644
--- a/py/objint_mpz.c
+++ b/py/objint_mpz.c
@@ -326,6 +326,57 @@
     }
 }
 
+#if MICROPY_PY_BUILTINS_POW3
+// Return an mpz view of the int object `arg`.
+// A small int is converted into the caller-provided `temp`, which is then
+// returned and must later be released by the caller with mpz_deinit().
+// Otherwise a pointer to the mpz stored inside the object is returned and
+// `temp` is left untouched.
+STATIC mpz_t *mp_mpz_for_int(mp_obj_t arg, mpz_t *temp) {
+    if (MP_OBJ_IS_SMALL_INT(arg)) {
+        mpz_init_from_int(temp, MP_OBJ_SMALL_INT_VALUE(arg));
+        return temp;
+    } else {
+        mp_obj_int_t *arg_p = MP_OBJ_TO_PTR(arg);
+        return &(arg_p->mpz);
+    }
+}
+
+// Implements the 3-argument form of pow(): (base ** exponent) % modulus.
+// Raises TypeError unless all three arguments are integers, and ValueError
+// if modulus is zero (as CPython does).  The result is always returned as a
+// new mpz-backed int object.
+mp_obj_t mp_obj_int_pow3(mp_obj_t base, mp_obj_t exponent, mp_obj_t modulus) {
+    if (!MP_OBJ_IS_INT(base) || !MP_OBJ_IS_INT(exponent) || !MP_OBJ_IS_INT(modulus)) {
+        mp_raise_TypeError("pow() with 3 arguments requires integers");
+    } else {
+        mpz_t l_temp, r_temp, m_temp;
+
+        // Convert and validate the modulus first so that nothing else has
+        // been allocated if it has to be rejected.
+        mpz_t *mod = mp_mpz_for_int(modulus, &m_temp);
+        if (mpz_is_zero(mod)) {
+            if (mod == &m_temp) { mpz_deinit(mod); }
+            mp_raise_msg(&mp_type_ValueError, "pow() 3rd argument cannot be 0");
+        }
+
+        mpz_t *lhs = mp_mpz_for_int(base,     &l_temp);
+        mpz_t *rhs = mp_mpz_for_int(exponent, &r_temp);
+
+        mp_obj_t result = mp_obj_new_int_from_ull(0); // Use the _from_ull version as this forces an mpz int
+        mp_obj_int_t *res_p = (mp_obj_int_t *) MP_OBJ_TO_PTR(result);
+
+        mpz_pow3_inpl(&(res_p->mpz), lhs, rhs, mod);
+
+        // Only the temporaries created above are owned by this function.
+        if (lhs == &l_temp) { mpz_deinit(lhs); }
+        if (rhs == &r_temp) { mpz_deinit(rhs); }
+        if (mod == &m_temp) { mpz_deinit(mod); }
+        return result;
+    }
+}
+#endif
+
 mp_obj_t mp_obj_new_int(mp_int_t value) {
     if (MP_SMALL_INT_FITS(value)) {
         return MP_OBJ_NEW_SMALL_INT(value);