diff --git a/boehmgc-coroutine-sp-fallback.diff b/boehmgc-coroutine-sp-fallback.diff
new file mode 100644
index 0000000000000000000000000000000000000000..fa8dd0325b952b4efa3e45df6e10fa8a63e1bb07
--- /dev/null
+++ b/boehmgc-coroutine-sp-fallback.diff
@@ -0,0 +1,42 @@
+diff --git a/pthread_stop_world.c b/pthread_stop_world.c
+index 1cee6a0b..46c3acd9 100644
+--- a/pthread_stop_world.c
++++ b/pthread_stop_world.c
+@@ -674,6 +674,8 @@ GC_INNER void GC_push_all_stacks(void)
+     struct GC_traced_stack_sect_s *traced_stack_sect;
+     pthread_t self = pthread_self();
+     word total_size = 0;
++    size_t stack_limit;
++    pthread_attr_t pattr;
+ 
+     if (!EXPECT(GC_thr_initialized, TRUE))
+       GC_thr_init();
+@@ -723,6 +725,28 @@ GC_INNER void GC_push_all_stacks(void)
+           hi = p->altstack + p->altstack_size;
+           /* FIXME: Need to scan the normal stack too, but how ? */
+           /* FIXME: Assume stack grows down */
++        } else {
++          if (pthread_getattr_np(p->id, &pattr)) {
++            ABORT("GC_push_all_stacks: pthread_getattr_np failed!");
++          }
++          if (pthread_attr_getstacksize(&pattr, &stack_limit)) {
++            ABORT("GC_push_all_stacks: pthread_attr_getstacksize failed!");
++          }
++          // When a thread goes into a coroutine, we lose its original sp until
++          // control flow returns to the thread.
++          // While in the coroutine, the sp points outside the thread stack,
++          // so we can detect this and push the entire thread stack instead,
++          // as an approximation.
++          // We assume that the coroutine has similarly added its entire stack.
++          // This could be made accurate by cooperating with the application
++          // via new functions and/or callbacks.
++          #ifndef STACK_GROWS_UP
++            if (lo >= hi || lo < hi - stack_limit) { // sp outside stack
++              lo = hi - stack_limit;
++            }
++          #else
++          #error "STACK_GROWS_UP not supported in boost_coroutine2 (as of june 2021), so we don't support it in Nix."
++          #endif
+         }
+         GC_push_all_stack_sections(lo, hi, traced_stack_sect);
+ #       ifdef STACK_GROWS_UP
diff --git a/flake.nix b/flake.nix
index c97130ac94a192e362b6b69553f218f65690f2ad..999028ec273368757536a8849d2c4aaab271a389 100644
--- a/flake.nix
+++ b/flake.nix
@@ -104,7 +104,13 @@
           });
 
         propagatedDeps =
-          [ (boehmgc.override { enableLargeConfig = true; })
+          [ ((boehmgc.override {
+              enableLargeConfig = true;
+            }).overrideAttrs(o: {
+              patches = (o.patches or []) ++ [
+                ./boehmgc-coroutine-sp-fallback.diff
+              ];
+            }))
           ];
 
         perlDeps =
diff --git a/src/libexpr/eval.cc b/src/libexpr/eval.cc
index ef9f8efca96c2593f32e1a3976883cc47f73e0f3..c078bf4a1771482a4c70b33927fe4d0c04208168 100644
--- a/src/libexpr/eval.cc
+++ b/src/libexpr/eval.cc
@@ -233,22 +233,34 @@ static void * oomHandler(size_t requested)
 }
 
 class BoehmGCStackAllocator : public StackAllocator {
-  boost::coroutines2::protected_fixedsize_stack stack {
-    // We allocate 8 MB, the default max stack size on NixOS.
-    // A smaller stack might be quicker to allocate but reduces the stack
-    // depth available for source filter expressions etc.
-    std::max(boost::context::stack_traits::default_size(), static_cast<std::size_t>(8 * 1024 * 1024))
+    boost::coroutines2::protected_fixedsize_stack stack {
+        // We allocate 8 MB, the default max stack size on NixOS.
+        // A smaller stack might be quicker to allocate but reduces the stack
+        // depth available for source filter expressions etc.
+        std::max(boost::context::stack_traits::default_size(), static_cast<std::size_t>(8 * 1024 * 1024))
     };
 
+    // This is specific to boost::coroutines2::protected_fixedsize_stack.
+    // The stack protection page is included in sctx.size, so we have to
+    // subtract one page size from the stack size.
+    std::size_t pfss_usable_stack_size(boost::context::stack_context &sctx) {
+        return sctx.size - boost::context::stack_traits::page_size();
+    }
+
   public:
     boost::context::stack_context allocate() override {
         auto sctx = stack.allocate();
-        GC_add_roots(static_cast<char *>(sctx.sp) - sctx.size, sctx.sp);
+
+        // Stacks generally start at a high address and grow to lower addresses.
+        // Architectures that do the opposite are rare; in fact so rare that
+        // boost_routine does not implement it.
+        // So we subtract the stack size.
+        GC_add_roots(static_cast<char *>(sctx.sp) - pfss_usable_stack_size(sctx), sctx.sp);
         return sctx;
     }
 
     void deallocate(boost::context::stack_context sctx) override {
-        GC_remove_roots(static_cast<char *>(sctx.sp) - sctx.size, sctx.sp);
+        GC_remove_roots(static_cast<char *>(sctx.sp) - pfss_usable_stack_size(sctx), sctx.sp);
         stack.deallocate(sctx);
     }