From: Mathieu Desnoyers on 8 Jul 2010 02:30

* Ma, Ling (ling.ma(a)intel.com) wrote:
> Hi Mathieu
>
> I did git revert as below:
>
> Revert "x86, mem: Optimize memcpy by avoiding memory false dependece"
>
> This reverts commit 77ff5e49c20bb72a99a12bfb00dbf90b2302f087.
>
> And I found that the machine still crashes with your config file.
>
> Could you revert patch 77ff5e49c20bb72a99a12bfb00dbf90b2302f087 on
> your machine and check it again?

Hi Ling,

I just successfully booted my machine 3 times in a row with a revert
of commit a1e5278e40f16a4611264f8da9e557c16cb6f6ed from -tip. This is
effectively the same as reverting commit
77ff5e49c20bb72a99a12bfb00dbf90b2302f087, since
a1e5278e40f16a4611264f8da9e557c16cb6f6ed is simply a merge of that
commit (the diffs are exactly the same).

Without this revert, the machine gets to:

  Loading Linux 2.6.35-rc4-tip-trace+ ...

and then reboots all by itself. I let it reboot 3 times in a row just
to make sure.

I am therefore certain that the faulty commit is "x86, mem: Optimize
memcpy by avoiding memory false dependece", identified by commit id
77ff5e49c20bb72a99a12bfb00dbf90b2302f087 and merged into -tip by
commit a1e5278e40f16a4611264f8da9e557c16cb6f6ed.

For the sake of the commit log, the affected machine is an Intel Xeon
E5405.
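As background for the commit being reverted: its "memory false
dependence" avoidance chooses the copy direction at runtime. The code
removed below compares the low bytes of the destination and source
pointers (cmp %dil, %sil) and branches to a backward copy when the
destination's low byte is the larger one, so that stores do not land
on addresses that upcoming loads are about to read. A rough C sketch
of that idea, purely for illustration (memcpy_dir_sketch is a made-up
name, and unlike the assembly, which compares only the low 8 bits of
each pointer, this sketch compares full addresses):

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical sketch of the reverted commit's direction choice --
 * NOT the kernel code.  The real assembly decides from only the low
 * bytes of the two pointers (%dil vs. %sil); full addresses are used
 * here for clarity.
 */
static void *memcpy_dir_sketch(void *dst, const void *src, size_t len)
{
	uint8_t *d = dst;
	const uint8_t *s = src;

	if ((uintptr_t)d <= (uintptr_t)s) {
		/* Forward copy: each store stays behind the next load. */
		for (size_t i = 0; i < len; i++)
			d[i] = s[i];
	} else {
		/* Backward copy: walk down so stores do not alias the
		 * loads of later iterations when dst overlaps src's tail. */
		while (len--)
			d[len] = s[len];
	}
	return dst;
}

Note that comparing only the low bytes, as the removed assembly does,
is a cheap heuristic rather than a true overlap test: it decides the
direction from 8 bits of each address.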
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
CC: "Ma, Ling" <ling.ma(a)intel.com>
CC: "mingo(a)elte.hu" <mingo(a)elte.hu>
CC: "H. Peter Anvin" <hpa(a)zytor.com>
---
 arch/x86/lib/memcpy_64.S | 156 ++++++++++++++++-------------------------------
 1 file changed, 54 insertions(+), 102 deletions(-)

Index: linux.trees.git/arch/x86/lib/memcpy_64.S
===================================================================
--- linux.trees.git.orig/arch/x86/lib/memcpy_64.S	2010-07-08 02:11:58.000000000 -0400
+++ linux.trees.git/arch/x86/lib/memcpy_64.S	2010-07-08 02:20:48.000000000 -0400
@@ -40,132 +40,84 @@ ENTRY(__memcpy)
 ENTRY(memcpy)
        CFI_STARTPROC

-       movq %rdi, %rax
-
        /*
-        * Use 32bit CMP here to avoid long NOP padding.
+        * Put the number of full 64-byte blocks into %ecx.
+        * Tail portion is handled at the end:
         */
-       cmp $0x20, %edx
-       jb .Lhandle_tail
+       movq %rdi, %rax
+       movl %edx, %ecx
+       shrl $6, %ecx
+       jz .Lhandle_tail
+
+       .p2align 4
+.Lloop_64:
        /*
-        * We check whether memory false dependece could occur,
-        * then jump to corresponding copy mode.
+        * We decrement the loop index here - and the zero-flag is
+        * checked at the end of the loop (instructions inbetween do
+        * not change the zero flag):
         */
-       cmp %dil, %sil
-       jl .Lcopy_backward
-       subl $0x20, %edx
-.Lcopy_forward_loop:
-       subq $0x20, %rdx
+       decl %ecx

        /*
-        * Move in blocks of 4x8 bytes:
+        * Move in blocks of 4x16 bytes:
         */
-       movq 0*8(%rsi), %r8
-       movq 1*8(%rsi), %r9
-       movq 2*8(%rsi), %r10
-       movq 3*8(%rsi), %r11
-       leaq 4*8(%rsi), %rsi
-
-       movq %r8, 0*8(%rdi)
-       movq %r9, 1*8(%rdi)
-       movq %r10, 2*8(%rdi)
-       movq %r11, 3*8(%rdi)
-       leaq 4*8(%rdi), %rdi
-       jae .Lcopy_forward_loop
-       addq $0x20, %rdx
-       jmp .Lhandle_tail
+       movq 0*8(%rsi), %r11
+       movq 1*8(%rsi), %r8
+       movq %r11, 0*8(%rdi)
+       movq %r8, 1*8(%rdi)

-.Lcopy_backward:
-       /*
-        * Calculate copy position to tail.
-        */
-       addq %rdx, %rsi
-       addq %rdx, %rdi
-       subq $0x20, %rdx
-       /*
-        * At most 3 ALU operations in one cycle,
-        * so append NOPS in the same 16bytes trunk.
-        */
-       .p2align 4
-.Lcopy_backward_loop:
-       subq $0x20, %rdx
-       movq -1*8(%rsi), %r8
-       movq -2*8(%rsi), %r9
-       movq -3*8(%rsi), %r10
-       movq -4*8(%rsi), %r11
-       leaq -4*8(%rsi), %rsi
-       movq %r8, -1*8(%rdi)
-       movq %r9, -2*8(%rdi)
-       movq %r10, -3*8(%rdi)
-       movq %r11, -4*8(%rdi)
-       leaq -4*8(%rdi), %rdi
-       jae .Lcopy_backward_loop
+       movq 2*8(%rsi), %r9
+       movq 3*8(%rsi), %r10
+       movq %r9, 2*8(%rdi)
+       movq %r10, 3*8(%rdi)
+
+       movq 4*8(%rsi), %r11
+       movq 5*8(%rsi), %r8
+       movq %r11, 4*8(%rdi)
+       movq %r8, 5*8(%rdi)
+
+       movq 6*8(%rsi), %r9
+       movq 7*8(%rsi), %r10
+       movq %r9, 6*8(%rdi)
+       movq %r10, 7*8(%rdi)
+
+       leaq 64(%rsi), %rsi
+       leaq 64(%rdi), %rdi
+
+       jnz .Lloop_64

-       /*
-        * Calculate copy position to head.
-        */
-       addq $0x20, %rdx
-       subq %rdx, %rsi
-       subq %rdx, %rdi
 .Lhandle_tail:
-       cmpq $16, %rdx
-       jb .Lless_16bytes
+       movl %edx, %ecx
+       andl $63, %ecx
+       shrl $3, %ecx
+       jz .Lhandle_7

-       /*
-        * Move data from 16 bytes to 31 bytes.
-        */
-       movq 0*8(%rsi), %r8
-       movq 1*8(%rsi), %r9
-       movq -2*8(%rsi, %rdx), %r10
-       movq -1*8(%rsi, %rdx), %r11
-       movq %r8, 0*8(%rdi)
-       movq %r9, 1*8(%rdi)
-       movq %r10, -2*8(%rdi, %rdx)
-       movq %r11, -1*8(%rdi, %rdx)
-       retq
        .p2align 4
-.Lless_16bytes:
-       cmpq $8, %rdx
-       jb .Lless_8bytes
-       /*
-        * Move data from 8 bytes to 15 bytes.
-        */
-       movq 0*8(%rsi), %r8
-       movq -1*8(%rsi, %rdx), %r9
-       movq %r8, 0*8(%rdi)
-       movq %r9, -1*8(%rdi, %rdx)
-       retq
-       .p2align 4
-.Lless_8bytes:
-       cmpq $4, %rdx
-       jb .Lless_3bytes
+.Lloop_8:
+       decl %ecx
+       movq (%rsi), %r8
+       movq %r8, (%rdi)
+       leaq 8(%rdi), %rdi
+       leaq 8(%rsi), %rsi
+       jnz .Lloop_8
+
+.Lhandle_7:
+       movl %edx, %ecx
+       andl $7, %ecx
+       jz .Lend

-       /*
-        * Move data from 4 bytes to 7 bytes.
-        */
-       movl (%rsi), %ecx
-       movl -4(%rsi, %rdx), %r8d
-       movl %ecx, (%rdi)
-       movl %r8d, -4(%rdi, %rdx)
-       retq
        .p2align 4
-.Lless_3bytes:
-       cmpl $0, %edx
-       je .Lend
-       /*
-        * Move data from 1 bytes to 3 bytes.
-        */
 .Lloop_1:
        movb (%rsi), %r8b
        movb %r8b, (%rdi)
        incq %rdi
        incq %rsi
-       decl %edx
+       decl %ecx
        jnz .Lloop_1

 .Lend:
-       retq
+       ret
        CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
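For readers who do not follow AT&T syntax easily, the routine restored
by this revert has a simple three-stage shape: copy len/64 unrolled
64-byte blocks, then (len mod 64)/8 eight-byte words, then the last
few bytes singly. A rough C equivalent, purely for illustration
(memcpy_blocks_sketch is a made-up name; the unaligned 8-byte accesses
this sketch performs are fine on x86):

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical C rendering of the restored routine's structure --
 * NOT the kernel code, which is the assembly in the patch above.
 */
static void *memcpy_blocks_sketch(void *dst, const void *src, size_t len)
{
	uint8_t *d = dst;
	const uint8_t *s = src;
	size_t blocks = len >> 6;        /* shrl $6, %ecx: full 64-byte blocks */
	size_t words  = (len & 63) >> 3; /* .Lhandle_tail: 8-byte words */
	size_t bytes  = len & 7;         /* .Lhandle_7: trailing bytes */

	while (blocks--) {               /* .Lloop_64 */
		size_t i;
		for (i = 0; i < 8; i++)
			((uint64_t *)d)[i] = ((const uint64_t *)s)[i];
		s += 64;
		d += 64;
	}
	while (words--) {                /* .Lloop_8 */
		*(uint64_t *)d = *(const uint64_t *)s;
		s += 8;
		d += 8;
	}
	while (bytes--)                  /* .Lloop_1 */
		*d++ = *s++;

	return dst;                      /* memcpy returns dst (%rax) */
}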
--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com