diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300 |
commit | 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch) | |
tree | 5e910f0e82173f4ef4f51111366a3f1299037a7b /arch/m32r/lib/checksum.S |
Initial import
Diffstat (limited to 'arch/m32r/lib/checksum.S')
-rw-r--r-- | arch/m32r/lib/checksum.S | 320 |
1 files changed, 320 insertions, 0 deletions
diff --git a/arch/m32r/lib/checksum.S b/arch/m32r/lib/checksum.S new file mode 100644 index 000000000..0af0360c7 --- /dev/null +++ b/arch/m32r/lib/checksum.S @@ -0,0 +1,320 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Pentium Pro/II routines: + * Alexander Kjeldaas <astor@guardian.no> + * Finn Arne Gangstad <finnag@guardian.no> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * Hirokazu Takata,Hiroyuki Kondo rewrite for the m32r architecture. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/errno.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) + */ + + +#ifdef CONFIG_ISA_DUAL_ISSUE + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ + + .text +ENTRY(csum_partial) + ; Function args + ; r0: unsigned char *buff + ; r1: int len + ; r2: unsigned int sum + + push r2 || ldi r2, #0 + and3 r7, r0, #1 ; Check alignment. + beqz r7, 1f ; Jump if alignment is ok. + ; 1-byte mis aligned + ldub r4, @r0 || addi r0, #1 + ; clear c-bit || Alignment uses up bytes. + cmp r0, r0 || addi r1, #-1 + ldi r3, #0 || addx r2, r4 + addx r2, r3 + .fillinsn +1: + and3 r4, r0, #2 ; Check alignment. + beqz r4, 2f ; Jump if alignment is ok. + ; clear c-bit || Alignment uses up two bytes. + cmp r0, r0 || addi r1, #-2 + bgtz r1, 1f ; Jump if we had at least two bytes. + bra 4f || addi r1, #2 + .fillinsn ; len(r1) was < 2. Deal with it. +1: + ; 2-byte aligned + lduh r4, @r0 || ldi r3, #0 + addx r2, r4 || addi r0, #2 + addx r2, r3 + .fillinsn +2: + ; 4-byte aligned + cmp r0, r0 ; clear c-bit + srl3 r6, r1, #5 + beqz r6, 2f + .fillinsn + +1: ld r3, @r0+ + ld r4, @r0+ ; +4 + ld r5, @r0+ ; +8 + ld r3, @r0+ || addx r2, r3 ; +12 + ld r4, @r0+ || addx r2, r4 ; +16 + ld r5, @r0+ || addx r2, r5 ; +20 + ld r3, @r0+ || addx r2, r3 ; +24 + ld r4, @r0+ || addx r2, r4 ; +28 + addx r2, r5 || addi r6, #-1 + addx r2, r3 + addx r2, r4 + bnez r6, 1b + + addx r2, r6 ; r6=0 + cmp r0, r0 ; This clears c-bit + .fillinsn +2: and3 r6, r1, #0x1c ; withdraw len + beqz r6, 4f + srli r6, #2 + .fillinsn + +3: ld r4, @r0+ || addi r6, #-1 + addx r2, r4 + bnez r6, 3b + + addx r2, r6 ; r6=0 + cmp r0, r0 ; This clears c-bit + .fillinsn +4: and3 r1, r1, #3 + beqz r1, 7f ; if len == 0 goto end + and3 r6, r1, #2 + beqz r6, 5f ; if len < 2 goto 5f(1byte) + lduh r4, @r0 || addi r0, #2 + addi r1, #-2 || slli r4, #16 + addx r2, r4 + beqz r1, 6f + .fillinsn +5: ldub r4, @r0 || ldi r1, #0 +#ifndef __LITTLE_ENDIAN__ + slli r4, #8 +#endif + addx r2, r4 + .fillinsn +6: addx r2, r1 + .fillinsn +7: + and3 r0, r2, #0xffff + srli r2, #16 + add r0, r2 + srl3 r2, r0, #16 + beqz r2, 1f + addi r0, #1 + and3 r0, r0, #0xffff + .fillinsn +1: + beqz r7, 1f ; swap the upper byte for the lower + and3 r2, r0, #0xff + srl3 r0, r0, #8 + slli r2, #8 + or r0, r2 + .fillinsn +1: + pop r2 || cmp r0, r0 + addx r0, r2 || ldi r2, #0 + addx r0, r2 + jmp r14 + +#else /* not CONFIG_ISA_DUAL_ISSUE */ + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ + + .text +ENTRY(csum_partial) + ; Function args + ; r0: unsigned char *buff + ; r1: int len + ; r2: unsigned int sum + + push r2 + ldi r2, #0 + and3 r7, r0, #1 ; Check alignment. + beqz r7, 1f ; Jump if alignment is ok. + ; 1-byte mis aligned + ldub r4, @r0 + addi r0, #1 + addi r1, #-1 ; Alignment uses up bytes. + cmp r0, r0 ; clear c-bit + ldi r3, #0 + addx r2, r4 + addx r2, r3 + .fillinsn +1: + and3 r4, r0, #2 ; Check alignment. + beqz r4, 2f ; Jump if alignment is ok. + addi r1, #-2 ; Alignment uses up two bytes. + cmp r0, r0 ; clear c-bit + bgtz r1, 1f ; Jump if we had at least two bytes. + addi r1, #2 ; len(r1) was < 2. Deal with it. + bra 4f + .fillinsn +1: + ; 2-byte aligned + lduh r4, @r0 + addi r0, #2 + ldi r3, #0 + addx r2, r4 + addx r2, r3 + .fillinsn +2: + ; 4-byte aligned + cmp r0, r0 ; clear c-bit + srl3 r6, r1, #5 + beqz r6, 2f + .fillinsn + +1: ld r3, @r0+ + ld r4, @r0+ ; +4 + ld r5, @r0+ ; +8 + addx r2, r3 + addx r2, r4 + addx r2, r5 + ld r3, @r0+ ; +12 + ld r4, @r0+ ; +16 + ld r5, @r0+ ; +20 + addx r2, r3 + addx r2, r4 + addx r2, r5 + ld r3, @r0+ ; +24 + ld r4, @r0+ ; +28 + addi r6, #-1 + addx r2, r3 + addx r2, r4 + bnez r6, 1b + addx r2, r6 ; r6=0 + cmp r0, r0 ; This clears c-bit + .fillinsn + +2: and3 r6, r1, #0x1c ; withdraw len + beqz r6, 4f + srli r6, #2 + .fillinsn + +3: ld r4, @r0+ + addi r6, #-1 + addx r2, r4 + bnez r6, 3b + addx r2, r6 ; r6=0 + cmp r0, r0 ; This clears c-bit + .fillinsn + +4: and3 r1, r1, #3 + beqz r1, 7f ; if len == 0 goto end + and3 r6, r1, #2 + beqz r6, 5f ; if len < 2 goto 5f(1byte) + + lduh r4, @r0 + addi r0, #2 + addi r1, #-2 + slli r4, #16 + addx r2, r4 + beqz r1, 6f + .fillinsn +5: ldub r4, @r0 +#ifndef __LITTLE_ENDIAN__ + slli r4, #8 +#endif + addx r2, r4 + .fillinsn +6: ldi r5, #0 + addx r2, r5 + .fillinsn +7: + and3 r0, r2, #0xffff + srli r2, #16 + add r0, r2 + srl3 r2, r0, #16 + beqz r2, 1f + addi r0, #1 + and3 r0, r0, #0xffff + .fillinsn +1: + beqz r7, 1f + mv r2, r0 + srl3 r0, r2, #8 + and3 r2, r2, #0xff + slli r2, #8 + or r0, r2 + .fillinsn +1: + pop r2 + cmp r0, r0 + addx r0, r2 + ldi r2, #0 + addx r0, r2 + jmp r14 + +#endif /* not CONFIG_ISA_DUAL_ISSUE */ + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, + int len, int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. + * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +ENTRY(csum_partial_copy_generic) + nop + nop + nop + nop + jmp r14 + nop + nop + nop + + .end |