/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
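
/*
 * Note (added for clarity): like C's memcpy(), this routine copies
 * strictly forward and assumes that source and destination do not
 * overlap; overlapping regions need memmove() instead.
 */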

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax			/* return value: original destination */

	movl %edx, %ecx
	shrl $3, %ecx			/* %ecx = count / 8 full quadwords */
	andl $7, %edx			/* %edx = count % 8 trailing bytes */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)
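
/*
 * Worked example for the split above (illustrative note, not part
 * of the original source): with count = 100 in %rdx, "shrl $3"
 * leaves %ecx = 12, so REP MOVSQ copies 96 bytes; "andl $7" leaves
 * %edx = 4, so REP MOVSB copies the remaining 4 bytes.
 */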

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl   $6, %ecx
	jz .Lhandle_tail

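	/*
	 * Worked example (illustrative note): for count = 200,
	 * "shrl $6" gives 3 full 64-byte blocks (192 bytes); the
	 * remaining 8 bytes are handled at .Lhandle_tail as one
	 * quadword and zero trailing bytes.
	 */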
	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions in between do
	 * not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes, issuing each pair of loads
	 * ahead of the matching pair of stores:
	 */
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10
	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8
	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10
	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz  .Lloop_64

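	/*
	 * Tail arithmetic (note added for clarity): bits 5..3 of the
	 * original count select up to seven trailing quadwords for
	 * .Lloop_8, and bits 2..0 select up to seven trailing bytes
	 * for .Lloop_1.
	 */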
.Lhandle_tail:
	movl %edx, %ecx
	andl  $63, %ecx
	shrl   $3, %ecx
	jz   .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),		%r8
	movq %r8,		(%rdi)
	leaq 8(%rdi),		%rdi
	leaq 8(%rsi),		%rsi
	jnz  .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous
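
	/*
	 * Displacement check (illustrative note, not part of the
	 * original source): a short jmp is two bytes - opcode 0xeb
	 * plus a disp8 relative to the end of the instruction. Once
	 * these two bytes are patched over the start of memcpy, the
	 * target is memcpy + 2 + disp8, so we need
	 * disp8 = (memcpy_c - memcpy) - 2, and (2f - 1b) above is
	 * exactly 2.
	 */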

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad 1b
	.byte X86_FEATURE_REP_GOOD

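	/*
	 * Note (added for clarity): the record above is parsed as
	 * { original address, replacement address, CPU feature flag },
	 * followed by the original and replacement lengths below.
	 */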
	/*
	 * Replace only the beginning: memcpy itself is used while
	 * applying alternatives, so it would be silly to overwrite
	 * it with nops - a reboot would be the only outcome...
	 */
	.byte 2b - 1b
	.byte 2b - 1b
	.previous