Ensure that Gc.minor_words remains accurate after a GC (#8619)

If an allocation fails, the decrement of young_ptr should be undone before the GC is entered. This happened correctly on bytecode but not on native code. This commit (squash of pull request #8619) fixes it for all the platforms supported by ocamlopt. amd64: add alternate entry points caml_call_gc{1,2,3} for code size optimisation. powerpc: introduce one GC call point per allocation size per function. Each call point corrects the allocation pointer r31 before calling caml_call_gc. i386, arm, arm64, s390x: update the allocation pointer after the conditional branch to the GC, not before. arm64: simplify the code generator: Ialloc can assume that less than 0x1_0000 bytes are allocated, since the max allocation size for the minor heap is less than that. This is a partial cherry-pick of commit 8ceec on multicore.
2019-05-04 09:01:23 +01:00 · 2019-05-04 09:01:23 +01:00 · c24e5b5c8a
parent b818e2af91
commit c24e5b5c8a
17 changed files with 240 additions and 69 deletions
--- a/4
+++ b/4
@ -156,6 +156,10 @@ Working version
 - #8607: Remove obsolete macros for pre-2002 MSVC support
  (Stephen Dolan, review by Nicolás Ojeda Bär and David Allsopp)

+- #8619: Ensure Gc.minor_words remains accurate after a GC.
+  (Stephen Dolan, Xavier Leroy and David Allsopp,
+   review by Xavier Leroy and Gabriel Scherer)
+
 ### Standard library:

 - #2262: take precision (.<n>) and flags ('+' and ' ') into account
--- a/asmcomp/amd64/emit.mlp
+++ b/asmcomp/amd64/emit.mlp
@ -273,7 +273,8 @@ let spacetime_before_uninstrumented_call ~node_ptr ~index =
 (* Record calls to the GC -- we've moved them out of the way *)

 type gc_call =
-  { gc_lbl: label;                      (* Entry label *)
+  { gc_size: int;                       (* Allocation size, in bytes *)
+    gc_lbl: label;                      (* Entry label *)
    gc_return_lbl: label;               (* Where to branch after GC *)
    gc_frame: label;                    (* Label of frame descriptor *)
    gc_spacetime : (X86_ast.arg * int) option;
@ -290,7 +291,13 @@ let emit_call_gc gc =
    assert Config.spacetime;
    spacetime_before_uninstrumented_call ~node_ptr ~index
  end;
-  emit_call "caml_call_gc";
+  begin match gc.gc_size with
+  | 16 -> emit_call "caml_call_gc1"
+  | 24 -> emit_call "caml_call_gc2"
+  | 32 -> emit_call "caml_call_gc3"
+  | n ->  I.add (int n) r15;
+          emit_call "caml_call_gc"
+  end;
  def_label gc.gc_frame;
  I.jmp (label gc.gc_return_lbl)

@ -660,7 +667,8 @@ let emit_instr fallthrough i =
          else Some (arg i 0, spacetime_index)
        in
        call_gc_sites :=
-          { gc_lbl = lbl_call_gc;
+          { gc_size = n;
+            gc_lbl = lbl_call_gc;
            gc_return_lbl = lbl_redo;
            gc_frame = lbl_frame;
            gc_spacetime; } :: !call_gc_sites
@ -969,6 +977,9 @@ let begin_assembly() =
    D.extrn "caml_young_limit" QWORD;
    D.extrn "caml_exception_pointer" QWORD;
    D.extrn "caml_call_gc" NEAR;
+    D.extrn "caml_call_gc1" NEAR;
+    D.extrn "caml_call_gc2" NEAR;
+    D.extrn "caml_call_gc3" NEAR;
    D.extrn "caml_c_call" NEAR;
    D.extrn "caml_allocN" NEAR;
    D.extrn "caml_alloc1" NEAR;
--- a/asmcomp/arm/emit.mlp
+++ b/asmcomp/arm/emit.mlp
@ -633,14 +633,17 @@ let emit_instr i =
        if !fastcode_flag then begin
          let lbl_redo = new_label() in
          `{emit_label lbl_redo}:`;
-          let ninstr = decompose_intconst
-                         (Int32.of_int n)
-                         (fun i ->
-                           `	sub	alloc_ptr, alloc_ptr, #{emit_int32 i}\n`) in
-          `	cmp	alloc_ptr, alloc_limit\n`;
-          `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
+          let first = ref true in
+          let ninstr =
+            decompose_intconst (Int32.of_int (n - 4)) (fun a ->
+              if !first
+              then `	sub	{emit_reg i.res.(0)}, alloc_ptr, #{emit_int32 a}\n`
+              else `	sub	{emit_reg i.res.(0)}, {emit_reg i.res.(0)}, #{emit_int32 a}\n`;
+              first := false) in
+          `	cmp	{emit_reg i.res.(0)}, alloc_limit\n`;
          let lbl_call_gc = new_label() in
-          `	bcc	{emit_label lbl_call_gc}\n`;
+          `	bls	{emit_label lbl_call_gc}\n`;
+          `	sub	alloc_ptr, {emit_reg i.res.(0)}, #4\n`;
          call_gc_sites :=
            { gc_lbl = lbl_call_gc;
              gc_return_lbl = lbl_redo;
--- a/asmcomp/arm64/emit.mlp
+++ b/asmcomp/arm64/emit.mlp
@ -527,23 +527,26 @@ let assembly_code_for_allocation ?label_after_call_gc i ~n ~far =
  if !fastcode_flag then begin
    let lbl_redo = new_label() in
    let lbl_call_gc = new_label() in
-    assert (n < 0x1_000_000);
-    let nl = n land 0xFFF and nh = n land 0xFFF_000 in
+    (* n is at most Max_young_whsize * 8, i.e. currently 0x808,
+       so it is reasonable to assume n < 0x1_000.  This makes
+       the generated code simpler. *)
+    assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
+    (* Instead of checking whether young_ptr - n < young_limit, we check whether
+       young_ptr - (n - 8) <= young_limit. It's equivalent, but this way around
+       we can avoid mutating young_ptr on failed allocations, by doing the
+       calculations in i.res.(0) instead of young_ptr. *)
    `{emit_label lbl_redo}:`;
-    if nh <> 0 then
-      `	sub	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int nh}\n`;
-    if nl <> 0 then
-      `	sub	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int nl}\n`;
-    `	cmp	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_limit}\n`;
-    `	add	{emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
+    `	sub	{emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #{emit_int (n - 8)}\n`;
+    `	cmp	{emit_reg i.res.(0)}, {emit_reg reg_alloc_limit}\n`;
    if not far then begin
-      `	b.lo	{emit_label lbl_call_gc}\n`
+      `	b.ls	{emit_label lbl_call_gc}\n`
    end else begin
      let lbl = new_label () in
-      `	b.cs	{emit_label lbl}\n`;
+      `	b.hi	{emit_label lbl}\n`;
      `	b	{emit_label lbl_call_gc}\n`;
      `{emit_label lbl}:\n`
    end;
+    `	sub	{emit_reg reg_alloc_ptr}, {emit_reg i.res.(0)}, #8\n`;
    call_gc_sites :=
      { gc_lbl = lbl_call_gc;
        gc_return_lbl = lbl_redo;
--- a/asmcomp/i386/emit.mlp
+++ b/asmcomp/i386/emit.mlp
@ -600,11 +600,11 @@ let emit_instr fallthrough i =
        def_label lbl_redo;
        I.mov (sym32 "caml_young_ptr") eax;
        I.sub (int n) eax;
-        I.mov eax (sym32 "caml_young_ptr");
        I.cmp (sym32 "caml_young_limit") eax;
        let lbl_call_gc = new_label() in
        let lbl_frame = record_frame_label i.live false Debuginfo.none in
        I.jb (label lbl_call_gc);
+        I.mov eax (sym32 "caml_young_ptr");
        I.lea (mem32 NONE 4 RAX) (reg i.res.(0));
        call_gc_sites :=
          { gc_lbl = lbl_call_gc;
--- a/asmcomp/power/emit.mlp
+++ b/asmcomp/power/emit.mlp
@ -16,7 +16,6 @@

 (* Emission of PowerPC assembly code *)

-open Misc
 open Cmm
 open Arch
 open Proc
@ -124,7 +123,7 @@ let emit_gpr = emit_int
 let emit_reg r =
  match r.loc with
  | Reg r -> emit_string (register_name r)
-  | _ -> fatal_error "Emit.emit_reg"
+  | _ -> Misc.fatal_error "Emit.emit_reg"

 (* Output a stack reference *)

@ -132,7 +131,7 @@ let emit_stack r =
  match r.loc with
  | Stack s ->
      let ofs = slot_offset s (register_class r) in `{emit_int ofs}(1)`
-  | _ -> fatal_error "Emit.emit_stack"
+  | _ -> Misc.fatal_error "Emit.emit_stack"

 (* Output the name of a symbol plus an optional offset *)

@ -393,8 +392,38 @@ let name_for_specific = function
 let function_name = ref ""
 (* Entry point for tail recursive calls *)
 let tailrec_entry_point = ref 0
-(* Label of glue code for calling the GC *)
-let call_gc_label = ref 0
+
+module IntSet = Stdlib.Set.Make(Stdlib.Int)
+module IntMap = Stdlib.Map.Make(Stdlib.Int)
+
+(* Labels of glue code for calling the GC.
+   There is one label per size allocated. *)
+let call_gc_labels : label IntMap.t ref = ref IntMap.empty
+                     (* size -> label *)
+
+(* Return the label of the call GC point for the given size *)
+
+let label_for_call_gc ?label_after_call_gc sz =
+  match IntMap.find_opt sz !call_gc_labels with
+  | Some lbl -> lbl
+  | None ->
+      let lbl =
+        match label_after_call_gc with Some l -> l | None -> new_label() in
+      call_gc_labels := IntMap.add sz lbl !call_gc_labels;
+      lbl
+
+(* Number of call GC points *)
+
+let num_call_gc instr =
+  let rec loop i cg =
+    match i.desc with
+    | Lend -> IntSet.cardinal cg
+    | Lop (Ialloc {bytes = sz}) -> loop i.next (IntSet.add sz cg)
+    (* The following should never be seen, since this function is run
+       before branch relaxation. *)
+    | Lop (Ispecific (Ialloc_far _)) -> assert false
+    | _ -> loop i.next cg
+  in loop instr IntSet.empty

 (* Relaxation of branches that exceed the span of a relative branch. *)

@ -560,7 +589,7 @@ let emit_instr i =
            | {loc = Stack _; typ = Float}, {loc = Reg _} ->
                `	lfd	{emit_reg dst}, {emit_stack src}\n`
            | (_, _) ->
-                fatal_error "Emit: Imove"
+                Misc.fatal_error "Emit: Imove"
        end
    | Lop(Iconst_int n) ->
        if is_native_immediate n then
@ -751,28 +780,20 @@ let emit_instr i =
          | Double | Double_u -> "stfd" in
        emit_load_store storeinstr addr i.arg 1 i.arg.(0)
    | Lop(Ialloc { bytes = n; label_after_call_gc; }) ->
-        if !call_gc_label = 0 then begin
-          match label_after_call_gc with
-          | None -> call_gc_label := new_label ()
-          | Some label -> call_gc_label := label
-        end;
+        let call_gc_lbl = label_for_call_gc ?label_after_call_gc n in
        `	addi    31, 31, {emit_int(-n)}\n`;
        `	{emit_string cmplg}	31, 30\n`;
        `	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`;
-        `	bltl	{emit_label !call_gc_label}\n`;
+        `	bltl	{emit_label call_gc_lbl}\n`;
        (* Exactly 4 instructions after the beginning of the alloc sequence *)
        record_frame i.live false Debuginfo.none
    | Lop(Ispecific(Ialloc_far { bytes = n; label_after_call_gc; })) ->
-        if !call_gc_label = 0 then begin
-          match label_after_call_gc with
-          | None -> call_gc_label := new_label ()
-          | Some label -> call_gc_label := label
-        end;
+        let call_gc_lbl = label_for_call_gc ?label_after_call_gc n in
        let lbl = new_label() in
        `	addi    31, 31, {emit_int(-n)}\n`;
        `	{emit_string cmplg}	31, 30\n`;
        `	bge	{emit_label lbl}\n`;
-        `	bl	{emit_label !call_gc_label}\n`;
+        `	bl	{emit_label call_gc_lbl}\n`;
        (* Exactly 4 instructions after the beginning of the alloc sequence *)
        record_frame i.live false Debuginfo.none;
        `{emit_label lbl}:	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`
@ -1009,7 +1030,7 @@ let fundecl fundecl =
  function_name := fundecl.fun_name;
  tailrec_entry_point := fundecl.fun_tailrec_entry_point_label;
  stack_offset := 0;
-  call_gc_label := 0;
+  call_gc_labels := IntMap.empty;
  float_literals := [];
  jumptables := []; jumptables_lbl := -1;
  begin match abi with
@ -1041,14 +1062,30 @@ let fundecl fundecl =
  end;
  emit_debug_info fundecl.fun_dbg;
  cfi_startproc();
-  (* On this target, there is at most one "out of line" code block per
-     function: a single "call GC" point.  It comes immediately after the
-     function's body. *)
-  BR.relax fundecl.fun_body ~max_out_of_line_code_offset:0;
+  let num_call_gc = num_call_gc fundecl.fun_body in
+  let max_out_of_line_code_offset = max (num_call_gc - 1) 0 in
+  BR.relax fundecl.fun_body ~max_out_of_line_code_offset;
  emit_all fundecl.fun_body;
  (* Emit the glue code to call the GC *)
-  if !call_gc_label > 0 then begin
-    `{emit_label !call_gc_label}:\n`;
+  assert (IntMap.cardinal !call_gc_labels = num_call_gc);
+  if num_call_gc > 0 then begin
+    (* Replace sizes by deltas with next size *)
+    let rec delta_encode = function
+      | (sz1, lbl1) :: ((sz2, _) :: _ as l) ->
+           (sz1 - sz2, lbl1) :: delta_encode l
+      | ([] | [(_,_)]) as l -> l in
+    (* Enumerate the GC call points by decreasing size.  This is not
+       necessary for correctness, but it is nice for two reasons:
+       1- all deltas are positive, making the generated code
+          easier to read, and
+       2- smaller allocation sizes, which are more frequent, execute
+          fewer instructions before calling the GC. *)
+    let delta_lbl_list =
+      delta_encode (List.rev (IntMap.bindings !call_gc_labels)) in
+    List.iter
+      (fun (delta, lbl) ->
+        `{emit_label lbl}:	addi	31, 31, {emit_int delta}\n`)
+      delta_lbl_list;
    match abi with
    | ELF32 ->
      `	b	{emit_symbol "caml_call_gc"}\n`
--- a/asmcomp/s390x/emit.mlp
+++ b/asmcomp/s390x/emit.mlp
@ -429,10 +429,11 @@ let emit_instr i =
            gc_return_lbl = lbl_redo;
            gc_frame_lbl = lbl_frame } :: !call_gc_sites;
        `{emit_label lbl_redo}:`;
-        `	lay	%r11, {emit_int(-n)}(%r11)\n`;
-        `	clgr	%r11, %r10\n`;
-        `	brcl	4, {emit_label lbl_call_gc}\n`;  (* less than *)
-        `	la	{emit_reg i.res.(0)}, 8(%r11)\n`
+        `	lay	{emit_reg i.res.(0)}, {emit_int(-n+8)}(%r11)\n`;
+        `	clgr	{emit_reg i.res.(0)}, %r10\n`;
+        `	brcl	12, {emit_label lbl_call_gc}\n`;
+                                                 (* less than or equal *)
+        `	lay	%r11, -8({emit_reg i.res.(0)})\n`

    | Lop(Iintop Imulh) ->
       (* Hacker's Delight section 8.3:
--- a/runtime/amd64.S
+++ b/runtime/amd64.S
@ -397,6 +397,7 @@ LBL(caml_alloc1):
        jb      LBL(100)
        ret
 LBL(100):
+        addq    $16, %r15
        RECORD_STACK_FRAME(0)
        ENTER_FUNCTION
 /*        subq    $8, %rsp; CFI_ADJUST (8); */
@ -414,6 +415,7 @@ LBL(caml_alloc2):
        jb      LBL(101)
        ret
 LBL(101):
+        addq    $24, %r15
        RECORD_STACK_FRAME(0)
        ENTER_FUNCTION
 /*        subq    $8, %rsp; CFI_ADJUST (8); */
@ -431,6 +433,7 @@ LBL(caml_alloc3):
        jb      LBL(102)
        ret
 LBL(102):
+        addq    $32, %r15
        RECORD_STACK_FRAME(0)
        ENTER_FUNCTION
 /*        subq    $8, %rsp; CFI_ADJUST (8) */
@ -450,6 +453,7 @@ LBL(caml_allocN):
        addq    $8, %rsp; CFI_ADJUST (-8)  /* drop desired size */
        ret
 LBL(103):
+        addq    0(%rsp), %r15
        CFI_ADJUST(8)
        RECORD_STACK_FRAME(8)
 #ifdef WITH_FRAME_POINTERS
@ -467,6 +471,27 @@ LBL(103):
        jmp     LBL(caml_allocN)
 CFI_ENDPROC

+/* Reset the allocation pointer and invoke the GC */
+
+FUNCTION(G(caml_call_gc1))
+CFI_STARTPROC
+        addq    $16, %r15
+        jmp     GCALL(caml_call_gc)
+CFI_ENDPROC
+
+FUNCTION(G(caml_call_gc2))
+CFI_STARTPROC
+        addq    $24, %r15
+        jmp     GCALL(caml_call_gc)
+CFI_ENDPROC
+
+FUNCTION(G(caml_call_gc3))
+CFI_STARTPROC
+        addq    $32, %r15
+        jmp     GCALL(caml_call_gc)
+CFI_ENDPROC
+
+
 /* Call a C function from OCaml */

 FUNCTION(G(caml_c_call))
--- a/runtime/amd64nt.asm
+++ b/runtime/amd64nt.asm
@ -150,6 +150,7 @@ caml_alloc1:
        jb      L100
        ret
 L100:
+        add     r15, 16
        mov     rax, [rsp + 0]
        mov     caml_last_return_address, rax
        lea     rax, [rsp + 8]
@ -167,6 +168,7 @@ caml_alloc2:
        jb      L101
        ret
 L101:
+        add     r15, 24
        mov     rax, [rsp + 0]
        mov     caml_last_return_address, rax
        lea     rax, [rsp + 8]
@ -184,6 +186,7 @@ caml_alloc3:
        jb      L102
        ret
 L102:
+        add     r15, 32
        mov     rax, [rsp + 0]
        mov     caml_last_return_address, rax
        lea     rax, [rsp + 8]
@ -201,6 +204,7 @@ caml_allocN:
        jb      L103
        ret
 L103:
+        add     r15, rax
        push    rax                       ; save desired size
        mov     rax, [rsp + 8]
        mov     caml_last_return_address, rax
@ -210,6 +214,26 @@ L103:
        pop     rax                      ; recover desired size
        jmp     caml_allocN

+; Reset the allocation pointer and invoke the GC
+
+        PUBLIC  caml_call_gc1
+        ALIGN   16
+caml_call_gc1:
+        add     r15, 16
+        jmp     caml_call_gc
+
+        PUBLIC  caml_call_gc2
+        ALIGN   16
+caml_call_gc2:
+        add     r15, 24
+        jmp     caml_call_gc
+
+        PUBLIC  caml_call_gc3
+        ALIGN 16
+caml_call_gc3:
+        add     r15, 32
+        jmp     caml_call_gc
+
 ; Call a C function from OCaml

        PUBLIC  caml_c_call
--- a/runtime/arm.S
+++ b/runtime/arm.S
@ -163,7 +163,8 @@ caml_alloc1:
        cmp     alloc_ptr, alloc_limit
        bcc     1f
        bx      lr
-1:  /* Record return address */
+1:      add     alloc_ptr, alloc_ptr, 8
+    /* Record return address */
        ldr     r7, =caml_last_return_address
        str     lr, [r7]
    /* Call GC (preserves r7) */
@ -185,7 +186,8 @@ caml_alloc2:
        cmp     alloc_ptr, alloc_limit
        bcc     1f
        bx      lr
-1:  /* Record return address */
+1:      add     alloc_ptr, alloc_ptr, 12
+    /* Record return address */
        ldr     r7, =caml_last_return_address
        str     lr, [r7]
    /* Call GC (preserves r7) */
@ -208,7 +210,8 @@ caml_alloc3:
        cmp     alloc_ptr, alloc_limit
        bcc     1f
        bx      lr
-1:  /* Record return address */
+1:      add     alloc_ptr, alloc_ptr, 16
+    /* Record return address */
        ldr     r7, =caml_last_return_address
        str     lr, [r7]
    /* Call GC (preserves r7) */
@ -230,7 +233,8 @@ caml_allocN:
        cmp     alloc_ptr, alloc_limit
        bcc     1f
        bx      lr
-1:  /* Record return address */
+1:      add     alloc_ptr, alloc_ptr, r7
+    /* Record return address */
        ldr     r12, =caml_last_return_address
        str     lr, [r12]
    /* Call GC (preserves r7) */
--- a/runtime/arm64.S
+++ b/runtime/arm64.S
@ -185,7 +185,8 @@ caml_alloc1:
        cmp     ALLOC_PTR, ALLOC_LIMIT
        b.lo    2f
        ret
-2:      stp     x29, x30, [sp, -16]!
+2:      add     ALLOC_PTR, ALLOC_PTR, #16
+        stp     x29, x30, [sp, -16]!
        CFI_ADJUST(16)
    /* Record the lowest address of the caller's stack frame.  This is the
       address immediately above the pair of words (x29 and x30) we just
@ -217,7 +218,8 @@ caml_alloc2:
        cmp     ALLOC_PTR, ALLOC_LIMIT
        b.lo    2f
        ret
-2:      stp     x29, x30, [sp, -16]!
+2:      add     ALLOC_PTR, ALLOC_PTR, #24
+        stp     x29, x30, [sp, -16]!
        CFI_ADJUST(16)
    /* Record the lowest address of the caller's stack frame.
       See comment above. */
@ -245,7 +247,8 @@ caml_alloc3:
        cmp     ALLOC_PTR, ALLOC_LIMIT
        b.lo    2f
        ret
-2:      stp     x29, x30, [sp, -16]!
+2:      add     ALLOC_PTR, ALLOC_PTR, #32
+        stp     x29, x30, [sp, -16]!
        CFI_ADJUST(16)
    /* Record the lowest address of the caller's stack frame.
       See comment above. */
@ -273,7 +276,8 @@ caml_allocN:
        cmp     ALLOC_PTR, ALLOC_LIMIT
        b.lo    2f
        ret
-2:      stp     x29, x30, [sp, -16]!
+2:      add     ALLOC_PTR, ALLOC_PTR, ARG
+        stp     x29, x30, [sp, -16]!
        CFI_ADJUST(16)
    /* Record the lowest address of the caller's stack frame.
       See comment above. */
--- a/runtime/i386.S
+++ b/runtime/i386.S
@ -113,9 +113,9 @@ FUNCTION(caml_alloc1)
        CFI_STARTPROC
        movl    G(caml_young_ptr), %eax
        subl    $8, %eax
-        movl    %eax, G(caml_young_ptr)
        cmpl    G(caml_young_limit), %eax
        jb      LBL(100)
+        movl    %eax, G(caml_young_ptr)
        ret
 LBL(100):
        movl    0(%esp), %eax
@ -132,9 +132,9 @@ FUNCTION(caml_alloc2)
        CFI_STARTPROC
        movl    G(caml_young_ptr), %eax
        subl    $12, %eax
-        movl    %eax, G(caml_young_ptr)
        cmpl    G(caml_young_limit), %eax
        jb      LBL(101)
+        movl    %eax, G(caml_young_ptr)
        ret
 LBL(101):
        movl    0(%esp), %eax
@ -151,9 +151,9 @@ FUNCTION(caml_alloc3)
        CFI_STARTPROC
        movl    G(caml_young_ptr), %eax
        subl    $16, %eax
-        movl    %eax, G(caml_young_ptr)
        cmpl    G(caml_young_limit), %eax
        jb      LBL(102)
+        movl    %eax, G(caml_young_ptr)
        ret
 LBL(102):
        movl    0(%esp), %eax
@ -178,7 +178,6 @@ LBL(103):
        subl    G(caml_young_ptr), %eax /* eax = - size */
        negl    %eax                    /* eax = size */
        pushl   %eax; CFI_ADJUST(4)     /* save desired size */
-        subl    %eax, G(caml_young_ptr) /* must update young_ptr */
        movl    4(%esp), %eax
        movl    %eax, G(caml_last_return_address)
        leal    8(%esp), %eax
--- a/runtime/i386nt.asm
+++ b/runtime/i386nt.asm
@ -74,9 +74,9 @@ L105:   push    ebp
 _caml_alloc1:
        mov     eax, _caml_young_ptr
        sub     eax, 8
-        mov     _caml_young_ptr, eax
        cmp     eax, _caml_young_limit
        jb      L100
+        mov     _caml_young_ptr, eax
        ret
 L100:   mov     eax, [esp]
        mov     _caml_last_return_address, eax
@ -89,9 +89,9 @@ L100:   mov     eax, [esp]
 _caml_alloc2:
        mov     eax, _caml_young_ptr
        sub     eax, 12
-        mov     _caml_young_ptr, eax
        cmp     eax, _caml_young_limit
        jb      L101
+        mov     _caml_young_ptr, eax
        ret
 L101:   mov     eax, [esp]
        mov     _caml_last_return_address, eax
@ -104,9 +104,9 @@ L101:   mov     eax, [esp]
 _caml_alloc3:
        mov     eax, _caml_young_ptr
        sub     eax, 16
-        mov     _caml_young_ptr, eax
        cmp     eax, _caml_young_limit
        jb      L102
+        mov     _caml_young_ptr, eax
        ret
 L102:   mov     eax, [esp]
        mov     _caml_last_return_address, eax
@ -126,7 +126,6 @@ _caml_allocN:
 L103:   sub     eax, _caml_young_ptr         ; eax = - size
        neg     eax                     ; eax = size
        push    eax                     ; save desired size
-        sub     _caml_young_ptr, eax         ; must update young_ptr
        mov     eax, [esp+4]
        mov     _caml_last_return_address, eax
        lea     eax, [esp+8]
--- a/stdlib/gc.mli
+++ b/stdlib/gc.mli
@ -18,9 +18,7 @@
 type stat =
  { minor_words : float;
    (** Number of words allocated in the minor heap since
-       the program was started.  This number is accurate in
-       byte-code programs, but only an approximation in programs
-       compiled to native code. *)
+       the program was started. *)

    promoted_words : float;
    (** Number of words allocated in the minor heap that
--- a/testsuite/tests/regression/pr7798/ocamltests
+++ b/testsuite/tests/regression/pr7798/ocamltests
@ -0,0 +1 @@
+pr7798.ml
--- a/testsuite/tests/regression/pr7798/pr7798.ml
+++ b/testsuite/tests/regression/pr7798/pr7798.ml
@ -0,0 +1,57 @@
+(* TEST
+   * bytecode
+   * native
+   * native
+     ocamlopt_flags = "-compact"
+*)
+
+type mut2 = { mutable p: int; mutable q:int }
+type mut3 = { mutable s: int; mutable t:int; mutable u:int }
+
+type mut_record =
+  { mutable a : int;
+    mutable b : int;
+    mutable c : int;
+    mutable d : int;
+    mutable e : int;
+    mutable f : int; }
+
+let go () =
+  let pre_before = Gc.minor_words () in
+  let before = Gc.minor_words () in
+  let alloc_per_minor_words = int_of_float (before -. pre_before) in
+  if Sys.backend_type = Sys.Native then assert (alloc_per_minor_words = 0);
+  let allocs = ref alloc_per_minor_words in
+  let n = 1_000_000 in
+  for i = 1 to n do
+    Sys.opaque_identity (ref i)
+    |> ignore;
+    allocs := !allocs + 2;
+  done;
+  for i = 1 to n do
+    Sys.opaque_identity { p = i; q = i }
+    |> ignore;
+    allocs := !allocs + 3;
+  done;
+  for i = 1 to n do
+    Sys.opaque_identity { s = i; t = i; u = i }
+    |> ignore;
+    allocs := !allocs + 4;
+  done;
+  for i = 1 to n do
+    Sys.opaque_identity { a = i; b = i; c = i; d = i; e = i; f = i }
+    |> ignore;
+    allocs := !allocs + 7;
+    if i mod (n/3) == 0 then Gc.full_major ();
+  done;
+  for i = 1 to n do
+    Sys.opaque_identity (Array.make 8 i)
+    |> ignore;
+    allocs := !allocs + 9;
+    if i mod (n/3) == 0 then Gc.compact ();
+  done;
+  let after = Gc.minor_words () in
+  let measured_allocs = int_of_float (after -. before) - alloc_per_minor_words in
+  Printf.printf "%d\n" (measured_allocs - !allocs)
+
+let () = go ()
--- a/testsuite/tests/regression/pr7798/pr7798.reference
+++ b/testsuite/tests/regression/pr7798/pr7798.reference
@ -0,0 +1 @@
+0