From c24e5b5c8aff88a116ae543e06cf44a07e44949e Mon Sep 17 00:00:00 2001
From: Stephen Dolan <mu@netsoc.tcd.ie>
Date: Sat, 4 May 2019 09:01:23 +0100
Subject: [PATCH] Ensure that Gc.minor_words remains accurate after a GC
 (#8619)

If an allocation fails, the decrement of young_ptr should be undone
before the GC is entered. This happened correctly on bytecode but not
on native code.

This commit (squash of pull request #8619) fixes it for all the
platforms supported by ocamlopt.

amd64: add alternate entry points caml_call_gc{1,2,3} for code size
optimisation.

powerpc: introduce one GC call point per allocation size per function.
Each call point corrects the allocation pointer r31 before calling
caml_call_gc.

i386, arm, arm64, s390x: update the allocation pointer after the
conditional branch to the GC, not before.

arm64: simplify the code generator: Ialloc can assume that less than
0x1_0000 bytes are allocated, since the max allocation size for the
minor heap is less than that.

This is a partial cherry-pick of commit 8ceec on multicore.
---
 Changes                                       |  4 +
 asmcomp/amd64/emit.mlp                        | 17 +++-
 asmcomp/arm/emit.mlp                          | 17 ++--
 asmcomp/arm64/emit.mlp                        | 23 ++---
 asmcomp/i386/emit.mlp                         |  2 +-
 asmcomp/power/emit.mlp                        | 87 +++++++++++++------
 asmcomp/s390x/emit.mlp                        |  9 +-
 runtime/amd64.S                               | 25 ++++++
 runtime/amd64nt.asm                           | 24 +++++
 runtime/arm.S                                 | 12 ++-
 runtime/arm64.S                               | 12 ++-
 runtime/i386.S                                |  7 +-
 runtime/i386nt.asm                            |  7 +-
 stdlib/gc.mli                                 |  4 +-
 testsuite/tests/regression/pr7798/ocamltests  |  1 +
 testsuite/tests/regression/pr7798/pr7798.ml   | 57 ++++++++++++
 .../tests/regression/pr7798/pr7798.reference  |  1 +
 17 files changed, 240 insertions(+), 69 deletions(-)
 create mode 100644 testsuite/tests/regression/pr7798/ocamltests
 create mode 100644 testsuite/tests/regression/pr7798/pr7798.ml
 create mode 100644 testsuite/tests/regression/pr7798/pr7798.reference
diff --git a/Changes b/Changes
index 12bf0ecfc..1cc1bce07 100644
--- a/Changes
+++ b/Changes
@@ -156,6 +156,10 @@ Working version
 - #8607: Remove obsolete macros for pre-2002 MSVC support
   (Stephen Dolan, review by Nicolás Ojeda Bär and David Allsopp)
 
+- #8619: Ensure Gc.minor_words remains accurate after a GC.
+  (Stephen Dolan, Xavier Leroy and David Allsopp,
+   review by Xavier Leroy and Gabriel Scherer)
+
 ### Standard library:
 
 - #2262: take precision (.<n>) and flags ('+' and ' ') into account
diff --git a/asmcomp/amd64/emit.mlp b/asmcomp/amd64/emit.mlp
index d16c51790..1a6498a66 100644
--- a/asmcomp/amd64/emit.mlp
+++ b/asmcomp/amd64/emit.mlp
@@ -273,7 +273,8 @@ let spacetime_before_uninstrumented_call ~node_ptr ~index =
 (* Record calls to the GC -- we've moved them out of the way *)
 
 type gc_call =
-  { gc_lbl: label;                      (* Entry label *)
+  { gc_size: int;                       (* Allocation size, in bytes *)
+    gc_lbl: label;                      (* Entry label *)
     gc_return_lbl: label;               (* Where to branch after GC *)
     gc_frame: label;                    (* Label of frame descriptor *)
     gc_spacetime : (X86_ast.arg * int) option;
@@ -290,7 +291,13 @@ let emit_call_gc gc =
     assert Config.spacetime;
     spacetime_before_uninstrumented_call ~node_ptr ~index
   end;
-  emit_call "caml_call_gc";
+  begin match gc.gc_size with
+  | 16 -> emit_call "caml_call_gc1"
+  | 24 -> emit_call "caml_call_gc2"
+  | 32 -> emit_call "caml_call_gc3"
+  | n ->  I.add (int n) r15;
+          emit_call "caml_call_gc"
+  end;
   def_label gc.gc_frame;
   I.jmp (label gc.gc_return_lbl)
 
@@ -660,7 +667,8 @@ let emit_instr fallthrough i =
           else Some (arg i 0, spacetime_index)
         in
         call_gc_sites :=
-          { gc_lbl = lbl_call_gc;
+          { gc_size = n;
+            gc_lbl = lbl_call_gc;
             gc_return_lbl = lbl_redo;
             gc_frame = lbl_frame;
             gc_spacetime; } :: !call_gc_sites
@@ -969,6 +977,9 @@ let begin_assembly() =
     D.extrn "caml_young_limit" QWORD;
     D.extrn "caml_exception_pointer" QWORD;
     D.extrn "caml_call_gc" NEAR;
+    D.extrn "caml_call_gc1" NEAR;
+    D.extrn "caml_call_gc2" NEAR;
+    D.extrn "caml_call_gc3" NEAR;
     D.extrn "caml_c_call" NEAR;
     D.extrn "caml_allocN" NEAR;
     D.extrn "caml_alloc1" NEAR;
diff --git a/asmcomp/arm/emit.mlp b/asmcomp/arm/emit.mlp
index 00d01748f..2d9802c11 100644
--- a/asmcomp/arm/emit.mlp
+++ b/asmcomp/arm/emit.mlp
@@ -633,14 +633,17 @@ let emit_instr i =
         if !fastcode_flag then begin
           let lbl_redo = new_label() in
           `{emit_label lbl_redo}:`;
-          let ninstr = decompose_intconst
-                         (Int32.of_int n)
-                         (fun i ->
-                           `	sub	alloc_ptr, alloc_ptr, #{emit_int32 i}\n`) in
-          `	cmp	alloc_ptr, alloc_limit\n`;
-          `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
+          let first = ref true in
+          let ninstr =
+            decompose_intconst (Int32.of_int (n - 4)) (fun a ->
+              if !first
+              then `	sub	{emit_reg i.res.(0)}, alloc_ptr, #{emit_int32 a}\n`
+              else `	sub	{emit_reg i.res.(0)}, {emit_reg i.res.(0)}, #{emit_int32 a}\n`;
+              first := false) in
+          `	cmp	{emit_reg i.res.(0)}, alloc_limit\n`;
           let lbl_call_gc = new_label() in
-          `	bcc	{emit_label lbl_call_gc}\n`;
+          `	bls	{emit_label lbl_call_gc}\n`;
+          `	sub	alloc_ptr, {emit_reg i.res.(0)}, #4\n`;
           call_gc_sites :=
             { gc_lbl = lbl_call_gc;
               gc_return_lbl = lbl_redo;
diff --git a/asmcomp/arm64/emit.mlp b/asmcomp/arm64/emit.mlp
index a00cbced8..cc575e083 100644
--- a/asmcomp/arm64/emit.mlp
+++ b/asmcomp/arm64/emit.mlp
@@ -527,23 +527,26 @@ let assembly_code_for_allocation ?label_after_call_gc i ~n ~far =
   if !fastcode_flag then begin
     let lbl_redo = new_label() in
     let lbl_call_gc = new_label() in
-    assert (n < 0x1_000_000);
-    let nl = n land 0xFFF and nh = n land 0xFFF_000 in
+    (* n is at most Max_young_whsize * 8, i.e. currently 0x808,
+       so it is reasonable to assume n < 0x1_000.  This makes
+       the generated code simpler. *)
+    assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
+    (* Instead of checking whether young_ptr - n < young_limit, we check whether
+       young_ptr - (n - 8) <= young_limit. It's equivalent, but this way around
+       we can avoid mutating young_ptr on failed allocations, by doing the
+       calculations in i.res.(0) instead of young_ptr. *)
     `{emit_label lbl_redo}:`;
-    if nh <> 0 then
-      `	sub	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int nh}\n`;
-    if nl <> 0 then
-      `	sub	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int nl}\n`;
-    `	cmp	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_limit}\n`;
-    `	add	{emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
+    `	sub	{emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #{emit_int (n - 8)}\n`;
+    `	cmp	{emit_reg i.res.(0)}, {emit_reg reg_alloc_limit}\n`;
     if not far then begin
-      `	b.lo	{emit_label lbl_call_gc}\n`
+      `	b.ls	{emit_label lbl_call_gc}\n`
     end else begin
       let lbl = new_label () in
-      `	b.cs	{emit_label lbl}\n`;
+      `	b.hi	{emit_label lbl}\n`;
       `	b	{emit_label lbl_call_gc}\n`;
       `{emit_label lbl}:\n`
     end;
+    `	sub	{emit_reg reg_alloc_ptr}, {emit_reg i.res.(0)}, #8\n`;
     call_gc_sites :=
       { gc_lbl = lbl_call_gc;
         gc_return_lbl = lbl_redo;
diff --git a/asmcomp/i386/emit.mlp b/asmcomp/i386/emit.mlp
index 9f55cd293..926215134 100644
--- a/asmcomp/i386/emit.mlp
+++ b/asmcomp/i386/emit.mlp
@@ -600,11 +600,11 @@ let emit_instr fallthrough i =
         def_label lbl_redo;
         I.mov (sym32 "caml_young_ptr") eax;
         I.sub (int n) eax;
-        I.mov eax (sym32 "caml_young_ptr");
         I.cmp (sym32 "caml_young_limit") eax;
         let lbl_call_gc = new_label() in
         let lbl_frame = record_frame_label i.live false Debuginfo.none in
         I.jb (label lbl_call_gc);
+        I.mov eax (sym32 "caml_young_ptr");
         I.lea (mem32 NONE 4 RAX) (reg i.res.(0));
         call_gc_sites :=
           { gc_lbl = lbl_call_gc;
diff --git a/asmcomp/power/emit.mlp b/asmcomp/power/emit.mlp
index 558d1a1e8..7bd96e551 100644
--- a/asmcomp/power/emit.mlp
+++ b/asmcomp/power/emit.mlp
@@ -16,7 +16,6 @@
 
 (* Emission of PowerPC assembly code *)
 
-open Misc
 open Cmm
 open Arch
 open Proc
@@ -124,7 +123,7 @@ let emit_gpr = emit_int
 let emit_reg r =
   match r.loc with
   | Reg r -> emit_string (register_name r)
-  | _ -> fatal_error "Emit.emit_reg"
+  | _ -> Misc.fatal_error "Emit.emit_reg"
 
 (* Output a stack reference *)
 
@@ -132,7 +131,7 @@ let emit_stack r =
   match r.loc with
   | Stack s ->
       let ofs = slot_offset s (register_class r) in `{emit_int ofs}(1)`
-  | _ -> fatal_error "Emit.emit_stack"
+  | _ -> Misc.fatal_error "Emit.emit_stack"
 
 (* Output the name of a symbol plus an optional offset *)
 
@@ -393,8 +392,38 @@ let name_for_specific = function
 let function_name = ref ""
 (* Entry point for tail recursive calls *)
 let tailrec_entry_point = ref 0
-(* Label of glue code for calling the GC *)
-let call_gc_label = ref 0
+
+module IntSet = Stdlib.Set.Make(Stdlib.Int)
+module IntMap = Stdlib.Map.Make(Stdlib.Int)
+
+(* Labels of glue code for calling the GC.
+   There is one label per size allocated. *)
+let call_gc_labels : label IntMap.t ref = ref IntMap.empty
+                     (* size -> label *)
+
+(* Return the label of the call GC point for the given size *)
+
+let label_for_call_gc ?label_after_call_gc sz =
+  match IntMap.find_opt sz !call_gc_labels with
+  | Some lbl -> lbl
+  | None ->
+      let lbl =
+        match label_after_call_gc with Some l -> l | None -> new_label() in
+      call_gc_labels := IntMap.add sz lbl !call_gc_labels;
+      lbl
+
+(* Number of call GC points *)
+
+let num_call_gc instr =
+  let rec loop i cg =
+    match i.desc with
+    | Lend -> IntSet.cardinal cg
+    | Lop (Ialloc {bytes = sz}) -> loop i.next (IntSet.add sz cg)
+    (* The following should never be seen, since this function is run
+       before branch relaxation. *)
+    | Lop (Ispecific (Ialloc_far _)) -> assert false
+    | _ -> loop i.next cg
+  in loop instr IntSet.empty
 
 (* Relaxation of branches that exceed the span of a relative branch. *)
 
@@ -560,7 +589,7 @@ let emit_instr i =
             | {loc = Stack _; typ = Float}, {loc = Reg _} ->
                 `	lfd	{emit_reg dst}, {emit_stack src}\n`
             | (_, _) ->
-                fatal_error "Emit: Imove"
+                Misc.fatal_error "Emit: Imove"
         end
     | Lop(Iconst_int n) ->
         if is_native_immediate n then
@@ -751,28 +780,20 @@ let emit_instr i =
           | Double | Double_u -> "stfd" in
         emit_load_store storeinstr addr i.arg 1 i.arg.(0)
     | Lop(Ialloc { bytes = n; label_after_call_gc; }) ->
-        if !call_gc_label = 0 then begin
-          match label_after_call_gc with
-          | None -> call_gc_label := new_label ()
-          | Some label -> call_gc_label := label
-        end;
+        let call_gc_lbl = label_for_call_gc ?label_after_call_gc n in
         `	addi    31, 31, {emit_int(-n)}\n`;
         `	{emit_string cmplg}	31, 30\n`;
         `	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`;
-        `	bltl	{emit_label !call_gc_label}\n`;
+        `	bltl	{emit_label call_gc_lbl}\n`;
         (* Exactly 4 instructions after the beginning of the alloc sequence *)
         record_frame i.live false Debuginfo.none
     | Lop(Ispecific(Ialloc_far { bytes = n; label_after_call_gc; })) ->
-        if !call_gc_label = 0 then begin
-          match label_after_call_gc with
-          | None -> call_gc_label := new_label ()
-          | Some label -> call_gc_label := label
-        end;
+        let call_gc_lbl = label_for_call_gc ?label_after_call_gc n in
         let lbl = new_label() in
         `	addi    31, 31, {emit_int(-n)}\n`;
         `	{emit_string cmplg}	31, 30\n`;
         `	bge	{emit_label lbl}\n`;
-        `	bl	{emit_label !call_gc_label}\n`;
+        `	bl	{emit_label call_gc_lbl}\n`;
         (* Exactly 4 instructions after the beginning of the alloc sequence *)
         record_frame i.live false Debuginfo.none;
         `{emit_label lbl}:	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`
@@ -1009,7 +1030,7 @@ let fundecl fundecl =
   function_name := fundecl.fun_name;
   tailrec_entry_point := fundecl.fun_tailrec_entry_point_label;
   stack_offset := 0;
-  call_gc_label := 0;
+  call_gc_labels := IntMap.empty;
   float_literals := [];
   jumptables := []; jumptables_lbl := -1;
   begin match abi with
@@ -1041,14 +1062,30 @@ let fundecl fundecl =
   end;
   emit_debug_info fundecl.fun_dbg;
   cfi_startproc();
-  (* On this target, there is at most one "out of line" code block per
-     function: a single "call GC" point.  It comes immediately after the
-     function's body. *)
-  BR.relax fundecl.fun_body ~max_out_of_line_code_offset:0;
+  let num_call_gc = num_call_gc fundecl.fun_body in
+  let max_out_of_line_code_offset = max (num_call_gc - 1) 0 in
+  BR.relax fundecl.fun_body ~max_out_of_line_code_offset;
   emit_all fundecl.fun_body;
   (* Emit the glue code to call the GC *)
-  if !call_gc_label > 0 then begin
-    `{emit_label !call_gc_label}:\n`;
+  assert (IntMap.cardinal !call_gc_labels = num_call_gc);
+  if num_call_gc > 0 then begin
+    (* Replace sizes by deltas with next size *)
+    let rec delta_encode = function
+      | (sz1, lbl1) :: ((sz2, _) :: _ as l) ->
+           (sz1 - sz2, lbl1) :: delta_encode l
+      | ([] | [(_,_)]) as l -> l in
+    (* Enumerate the GC call points by decreasing size.  This is not
+       necessary for correctness, but it is nice for two reasons:
+       1- all deltas are positive, making the generated code
+          easier to read, and
+       2- smaller allocation sizes, which are more frequent, execute
+          fewer instructions before calling the GC. *)
+    let delta_lbl_list =
+      delta_encode (List.rev (IntMap.bindings !call_gc_labels)) in
+    List.iter
+      (fun (delta, lbl) ->
+        `{emit_label lbl}:	addi	31, 31, {emit_int delta}\n`)
+      delta_lbl_list;
     match abi with
     | ELF32 ->
       `	b	{emit_symbol "caml_call_gc"}\n`
diff --git a/asmcomp/s390x/emit.mlp b/asmcomp/s390x/emit.mlp
index 619b454fe..4cf31436f 100644
--- a/asmcomp/s390x/emit.mlp
+++ b/asmcomp/s390x/emit.mlp
@@ -429,10 +429,11 @@ let emit_instr i =
             gc_return_lbl = lbl_redo;
             gc_frame_lbl = lbl_frame } :: !call_gc_sites;
         `{emit_label lbl_redo}:`;
-        `	lay	%r11, {emit_int(-n)}(%r11)\n`;
-        `	clgr	%r11, %r10\n`;
-        `	brcl	4, {emit_label lbl_call_gc}\n`;  (* less than *)
-        `	la	{emit_reg i.res.(0)}, 8(%r11)\n`
+        `	lay	{emit_reg i.res.(0)}, {emit_int(-n+8)}(%r11)\n`;
+        `	clgr	{emit_reg i.res.(0)}, %r10\n`;
+        `	brcl	12, {emit_label lbl_call_gc}\n`;
+                                                 (* less than or equal *)
+        `	lay	%r11, -8({emit_reg i.res.(0)})\n`
 
     | Lop(Iintop Imulh) ->
        (* Hacker's Delight section 8.3:
diff --git a/runtime/amd64.S b/runtime/amd64.S
index 1a024283e..66cd00027 100644
--- a/runtime/amd64.S
+++ b/runtime/amd64.S
@@ -397,6 +397,7 @@ LBL(caml_alloc1):
         jb      LBL(100)
         ret
 LBL(100):
+        addq    $16, %r15
         RECORD_STACK_FRAME(0)
         ENTER_FUNCTION
 /*        subq    $8, %rsp; CFI_ADJUST (8); */
@@ -414,6 +415,7 @@ LBL(caml_alloc2):
         jb      LBL(101)
         ret
 LBL(101):
+        addq    $24, %r15
         RECORD_STACK_FRAME(0)
         ENTER_FUNCTION
 /*        subq    $8, %rsp; CFI_ADJUST (8); */
@@ -431,6 +433,7 @@ LBL(caml_alloc3):
         jb      LBL(102)
         ret
 LBL(102):
+        addq    $32, %r15
         RECORD_STACK_FRAME(0)
         ENTER_FUNCTION
 /*        subq    $8, %rsp; CFI_ADJUST (8) */
@@ -450,6 +453,7 @@ LBL(caml_allocN):
         addq    $8, %rsp; CFI_ADJUST (-8)  /* drop desired size */
         ret
 LBL(103):
+        addq    0(%rsp), %r15
         CFI_ADJUST(8)
         RECORD_STACK_FRAME(8)
 #ifdef WITH_FRAME_POINTERS
@@ -467,6 +471,27 @@ LBL(103):
         jmp     LBL(caml_allocN)
 CFI_ENDPROC
 
+/* Reset the allocation pointer and invoke the GC */
+
+FUNCTION(G(caml_call_gc1))
+CFI_STARTPROC
+        addq    $16, %r15
+        jmp     GCALL(caml_call_gc)
+CFI_ENDPROC
+
+FUNCTION(G(caml_call_gc2))
+CFI_STARTPROC
+        addq    $24, %r15
+        jmp     GCALL(caml_call_gc)
+CFI_ENDPROC
+
+FUNCTION(G(caml_call_gc3))
+CFI_STARTPROC
+        addq    $32, %r15
+        jmp     GCALL(caml_call_gc)
+CFI_ENDPROC
+
+
 /* Call a C function from OCaml */
 
 FUNCTION(G(caml_c_call))
diff --git a/runtime/amd64nt.asm b/runtime/amd64nt.asm
index f7509ce16..e6b173771 100644
--- a/runtime/amd64nt.asm
+++ b/runtime/amd64nt.asm
@@ -150,6 +150,7 @@ caml_alloc1:
         jb      L100
         ret
 L100:
+        add     r15, 16
         mov     rax, [rsp + 0]
         mov     caml_last_return_address, rax
         lea     rax, [rsp + 8]
@@ -167,6 +168,7 @@ caml_alloc2:
         jb      L101
         ret
 L101:
+        add     r15, 24
         mov     rax, [rsp + 0]
         mov     caml_last_return_address, rax
         lea     rax, [rsp + 8]
@@ -184,6 +186,7 @@ caml_alloc3:
         jb      L102
         ret
 L102:
+        add     r15, 32
         mov     rax, [rsp + 0]
         mov     caml_last_return_address, rax
         lea     rax, [rsp + 8]
@@ -201,6 +204,7 @@ caml_allocN:
         jb      L103
         ret
 L103:
+        add     r15, rax
         push    rax                       ; save desired size
         mov     rax, [rsp + 8]
         mov     caml_last_return_address, rax
@@ -210,6 +214,26 @@ L103:
         pop     rax                      ; recover desired size
         jmp     caml_allocN
 
+; Reset the allocation pointer and invoke the GC
+
+        PUBLIC  caml_call_gc1
+        ALIGN   16
+caml_call_gc1:
+        add     r15, 16
+        jmp     caml_call_gc
+
+        PUBLIC  caml_call_gc2
+        ALIGN   16
+caml_call_gc2:
+        add     r15, 24
+        jmp     caml_call_gc
+
+        PUBLIC  caml_call_gc3
+        ALIGN 16
+caml_call_gc3:
+        add     r15, 32
+        jmp     caml_call_gc
+
 ; Call a C function from OCaml
 
         PUBLIC  caml_c_call
diff --git a/runtime/arm.S b/runtime/arm.S
index fd43b2141..2f2275cb8 100644
--- a/runtime/arm.S
+++ b/runtime/arm.S
@@ -163,7 +163,8 @@ caml_alloc1:
         cmp     alloc_ptr, alloc_limit
         bcc     1f
         bx      lr
-1:  /* Record return address */
+1:      add     alloc_ptr, alloc_ptr, 8
+    /* Record return address */
         ldr     r7, =caml_last_return_address
         str     lr, [r7]
     /* Call GC (preserves r7) */
@@ -185,7 +186,8 @@ caml_alloc2:
         cmp     alloc_ptr, alloc_limit
         bcc     1f
         bx      lr
-1:  /* Record return address */
+1:      add     alloc_ptr, alloc_ptr, 12
+    /* Record return address */
         ldr     r7, =caml_last_return_address
         str     lr, [r7]
     /* Call GC (preserves r7) */
@@ -208,7 +210,8 @@ caml_alloc3:
         cmp     alloc_ptr, alloc_limit
         bcc     1f
         bx      lr
-1:  /* Record return address */
+1:      add     alloc_ptr, alloc_ptr, 16
+    /* Record return address */
         ldr     r7, =caml_last_return_address
         str     lr, [r7]
     /* Call GC (preserves r7) */
@@ -230,7 +233,8 @@ caml_allocN:
         cmp     alloc_ptr, alloc_limit
         bcc     1f
         bx      lr
-1:  /* Record return address */
+1:      add     alloc_ptr, alloc_ptr, r7
+    /* Record return address */
         ldr     r12, =caml_last_return_address
         str     lr, [r12]
     /* Call GC (preserves r7) */
diff --git a/runtime/arm64.S b/runtime/arm64.S
index f78572639..7c26ddad2 100644
--- a/runtime/arm64.S
+++ b/runtime/arm64.S
@@ -185,7 +185,8 @@ caml_alloc1:
         cmp     ALLOC_PTR, ALLOC_LIMIT
         b.lo    2f
         ret
-2:      stp     x29, x30, [sp, -16]!
+2:      add     ALLOC_PTR, ALLOC_PTR, #16
+        stp     x29, x30, [sp, -16]!
         CFI_ADJUST(16)
     /* Record the lowest address of the caller's stack frame.  This is the
        address immediately above the pair of words (x29 and x30) we just
@@ -217,7 +218,8 @@ caml_alloc2:
         cmp     ALLOC_PTR, ALLOC_LIMIT
         b.lo    2f
         ret
-2:      stp     x29, x30, [sp, -16]!
+2:      add     ALLOC_PTR, ALLOC_PTR, #24
+        stp     x29, x30, [sp, -16]!
         CFI_ADJUST(16)
     /* Record the lowest address of the caller's stack frame.
        See comment above. */
@@ -245,7 +247,8 @@ caml_alloc3:
         cmp     ALLOC_PTR, ALLOC_LIMIT
         b.lo    2f
         ret
-2:      stp     x29, x30, [sp, -16]!
+2:      add     ALLOC_PTR, ALLOC_PTR, #32
+        stp     x29, x30, [sp, -16]!
         CFI_ADJUST(16)
     /* Record the lowest address of the caller's stack frame.
        See comment above. */
@@ -273,7 +276,8 @@ caml_allocN:
         cmp     ALLOC_PTR, ALLOC_LIMIT
         b.lo    2f
         ret
-2:      stp     x29, x30, [sp, -16]!
+2:      add     ALLOC_PTR, ALLOC_PTR, ARG
+        stp     x29, x30, [sp, -16]!
         CFI_ADJUST(16)
     /* Record the lowest address of the caller's stack frame.
        See comment above. */
diff --git a/runtime/i386.S b/runtime/i386.S
index 314dfe43b..007b3a01e 100644
--- a/runtime/i386.S
+++ b/runtime/i386.S
@@ -113,9 +113,9 @@ FUNCTION(caml_alloc1)
         CFI_STARTPROC
         movl    G(caml_young_ptr), %eax
         subl    $8, %eax
-        movl    %eax, G(caml_young_ptr)
         cmpl    G(caml_young_limit), %eax
         jb      LBL(100)
+        movl    %eax, G(caml_young_ptr)
         ret
 LBL(100):
         movl    0(%esp), %eax
@@ -132,9 +132,9 @@ FUNCTION(caml_alloc2)
         CFI_STARTPROC
         movl    G(caml_young_ptr), %eax
         subl    $12, %eax
-        movl    %eax, G(caml_young_ptr)
         cmpl    G(caml_young_limit), %eax
         jb      LBL(101)
+        movl    %eax, G(caml_young_ptr)
         ret
 LBL(101):
         movl    0(%esp), %eax
@@ -151,9 +151,9 @@ FUNCTION(caml_alloc3)
         CFI_STARTPROC
         movl    G(caml_young_ptr), %eax
         subl    $16, %eax
-        movl    %eax, G(caml_young_ptr)
         cmpl    G(caml_young_limit), %eax
         jb      LBL(102)
+        movl    %eax, G(caml_young_ptr)
         ret
 LBL(102):
         movl    0(%esp), %eax
@@ -178,7 +178,6 @@ LBL(103):
         subl    G(caml_young_ptr), %eax /* eax = - size */
         negl    %eax                    /* eax = size */
         pushl   %eax; CFI_ADJUST(4)     /* save desired size */
-        subl    %eax, G(caml_young_ptr) /* must update young_ptr */
         movl    4(%esp), %eax
         movl    %eax, G(caml_last_return_address)
         leal    8(%esp), %eax
diff --git a/runtime/i386nt.asm b/runtime/i386nt.asm
index b67306769..b8d5cc4bc 100644
--- a/runtime/i386nt.asm
+++ b/runtime/i386nt.asm
@@ -74,9 +74,9 @@ L105:   push    ebp
 _caml_alloc1:
         mov     eax, _caml_young_ptr
         sub     eax, 8
-        mov     _caml_young_ptr, eax
         cmp     eax, _caml_young_limit
         jb      L100
+        mov     _caml_young_ptr, eax
         ret
 L100:   mov     eax, [esp]
         mov     _caml_last_return_address, eax
@@ -89,9 +89,9 @@ L100:   mov     eax, [esp]
 _caml_alloc2:
         mov     eax, _caml_young_ptr
         sub     eax, 12
-        mov     _caml_young_ptr, eax
         cmp     eax, _caml_young_limit
         jb      L101
+        mov     _caml_young_ptr, eax
         ret
 L101:   mov     eax, [esp]
         mov     _caml_last_return_address, eax
@@ -104,9 +104,9 @@ L101:   mov     eax, [esp]
 _caml_alloc3:
         mov     eax, _caml_young_ptr
         sub     eax, 16
-        mov     _caml_young_ptr, eax
         cmp     eax, _caml_young_limit
         jb      L102
+        mov     _caml_young_ptr, eax
         ret
 L102:   mov     eax, [esp]
         mov     _caml_last_return_address, eax
@@ -126,7 +126,6 @@ _caml_allocN:
 L103:   sub     eax, _caml_young_ptr         ; eax = - size
         neg     eax                     ; eax = size
         push    eax                     ; save desired size
-        sub     _caml_young_ptr, eax         ; must update young_ptr
         mov     eax, [esp+4]
         mov     _caml_last_return_address, eax
         lea     eax, [esp+8]
diff --git a/stdlib/gc.mli b/stdlib/gc.mli
index 47e7dedf4..813a4ada0 100644
--- a/stdlib/gc.mli
+++ b/stdlib/gc.mli
@@ -18,9 +18,7 @@
 type stat =
   { minor_words : float;
     (** Number of words allocated in the minor heap since
-       the program was started.  This number is accurate in
-       byte-code programs, but only an approximation in programs
-       compiled to native code. *)
+       the program was started. *)
 
     promoted_words : float;
     (** Number of words allocated in the minor heap that
diff --git a/testsuite/tests/regression/pr7798/ocamltests b/testsuite/tests/regression/pr7798/ocamltests
new file mode 100644
index 000000000..86ca3eab1
--- /dev/null
+++ b/testsuite/tests/regression/pr7798/ocamltests
@@ -0,0 +1 @@
+pr7798.ml
diff --git a/testsuite/tests/regression/pr7798/pr7798.ml b/testsuite/tests/regression/pr7798/pr7798.ml
new file mode 100644
index 000000000..a91b4dc2f
--- /dev/null
+++ b/testsuite/tests/regression/pr7798/pr7798.ml
@@ -0,0 +1,57 @@
+(* TEST
+   * bytecode
+   * native
+   * native
+     ocamlopt_flags = "-compact"
+*)
+
+type mut2 = { mutable p: int; mutable q:int }
+type mut3 = { mutable s: int; mutable t:int; mutable u:int }
+
+type mut_record =
+  { mutable a : int;
+    mutable b : int;
+    mutable c : int;
+    mutable d : int;
+    mutable e : int;
+    mutable f : int; }
+
+let go () =
+  let pre_before = Gc.minor_words () in
+  let before = Gc.minor_words () in
+  let alloc_per_minor_words = int_of_float (before -. pre_before) in
+  if Sys.backend_type = Sys.Native then assert (alloc_per_minor_words = 0);
+  let allocs = ref alloc_per_minor_words in
+  let n = 1_000_000 in
+  for i = 1 to n do
+    Sys.opaque_identity (ref i)
+    |> ignore;
+    allocs := !allocs + 2;
+  done;
+  for i = 1 to n do
+    Sys.opaque_identity { p = i; q = i }
+    |> ignore;
+    allocs := !allocs + 3;
+  done;
+  for i = 1 to n do
+    Sys.opaque_identity { s = i; t = i; u = i }
+    |> ignore;
+    allocs := !allocs + 4;
+  done;
+  for i = 1 to n do
+    Sys.opaque_identity { a = i; b = i; c = i; d = i; e = i; f = i }
+    |> ignore;
+    allocs := !allocs + 7;
+    if i mod (n/3) == 0 then Gc.full_major ();
+  done;
+  for i = 1 to n do
+    Sys.opaque_identity (Array.make 8 i)
+    |> ignore;
+    allocs := !allocs + 9;
+    if i mod (n/3) == 0 then Gc.compact ();
+  done;
+  let after = Gc.minor_words () in
+  let measured_allocs = int_of_float (after -. before) - alloc_per_minor_words in
+  Printf.printf "%d\n" (measured_allocs - !allocs)
+
+let () = go ()
diff --git a/testsuite/tests/regression/pr7798/pr7798.reference b/testsuite/tests/regression/pr7798/pr7798.reference
new file mode 100644
index 000000000..573541ac9
--- /dev/null
+++ b/testsuite/tests/regression/pr7798/pr7798.reference
@@ -0,0 +1 @@
+0