Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Mon, 29 Jul 2013 02:04:06 +0200
From: Katja Malvoni <>
Subject: Re: Parallella: bcrypt

Hi Yaniv, Alexander,

I have the following code, which is supposed to replace the do-while loop in

    unsigned int offs1, offs2;
    offs1 = ctx1->s.P - ctx0->s.P;
    offs2 = (ctx1->s.P - ctx0->s.P) + 1;
__asm__ __volatile__(
            "loop2: add r45, %[ctx0], 0x4\n"
            "add r46, %[ctx1], 0x4\n"
            "eor %[L0], %[P00], %[L0]\n"
            "eor %[L1], %[P10], %[L1]\n"
            "eor r22, %[R0], %[P017]\n"
            "str r22, [%[ptr]]\n"
            "str %[L0], [%[ptr], +0x1]\n"
            "eor r23, %[R1], %[P117]\n"
            "str r23, [%[ptr], +%[offs1]]\n"
            "str %[L1], [%[ptr], +%[offs2]]\n"
            "add %[ptr], %[ptr], 8\n"
            "sub r24, %[end], %[ptr]\n"
            "mov %[R0], %[L0]\n"
            "mov %[L0], r22\n"
            "mov %[R1], %[L1]\n"
            "mov %[L1], r23\n"
            "bgtu loop2\n"

The problem is that I get more or less the same objdump (the differences are in
the assigned registers and irrelevant differences in instruction order) whether
the loop is in C or in assembly, but when the loop is in assembly I get an
incorrect result. I spent a lot of time trying to figure out where the mistake
is and I can't find it. But I need a way to preload the P arrays in assembly,
and implementing the whole loop in assembly was my only idea. The same approach
for implementing a loop works in BF_encrypt(). If I copy-paste the code from
that loop, I get an incorrect result too.

I pasted below only the relevant lines of both objdumps, i.e. the last update of
each used register (except for L0, L1, R0, R1, which are updated in BF2_2ROUND)
and the parts before and after BF2_2ROUND. I can't find a difference between
these two objdumps that would cause one to produce correct results and the other
one incorrect results.
When the loop and the storing of Ls and Rs are in C, the compiler-generated code is:
sub r53,r1,r0 ; sub r53, ctx1, ctx0
asr r53,r53,0x2
lsl r57,r53,0x2 ; r57 = offs1
add r58,r57,4; r58 = offs2
add r3,r0,72
mov r2, r3 ; r2 = ptr
ldr r4,[r0,+0x11] ; r4 = P0[17]
ldr r59,[r1,+0x11] ; r59 = P1[17]
mov r49,0x1048
add r49,r0,r49 ; r49 = end
lsr r22,r48,0x6
eor r12,r12,r23
lsr r62,r47,0x6
eor r26,r26,r63
1818: mov r48,r12 ; mov R0, L0
mov r47,r26 ; mov R1, L1
mov r12,r22 ; mov L0, r22
mov r26,r23 ; mov L1, r23
add r45,r0,4
add r46,r1,4
eor r22,r48,r4 ; eor r22, R0, P0[17]
str r22,[r2] ; str r22, ptr
str r12,[r2,+0x1] ; str L0, ptr+1
eor r23,r47,r59 ; eor r23, R1, P1[17]
str r23,[r2,+r57] ; str r23, ptr+offs1
str r26,[r2,+r58] ; str L1, ptr+offs2
add r2,r2,8 ; add ptr, ptr, 8
sub r24,r49,r2 ; sub r24, end, ptr
bgtu 1818 <__HALF_BANK_SIZE_+0x818>

When the loop is in assembly:

sub r54,r1,r0 ; r54 = offs1
add r57,r54,4 ; r57 = offs2
add r18,r0,72 ; r18 = ptr
ldr r55,[r0,+0x11] ; r55 = P0[17]
ldr r56,[r1,+0x11] ; r56 = P1[17]
mov r17,0x1048
add r17,r0,r17 ; r17 = end
lsr r22,r3,0x6
eor r12,r12,r23
lsr r62,r2,0x6
eor r16,r16,r63
loop2: add r45,r0,4
add r46,r1,4
eor r22,r3,r55 ; eor r22, R0, P0[17]
str r22,[r18] ; str r22, ptr
str r12,[r18,+0x1] ; str L0, ptr+1
eor r23,r2,r56 ; eor r23, R1, P1[17]
str r23,[r18,+r54] ; str r23, ptr+offs1
str r16,[r18,+r57] ; str L1, ptr+offs2
add r18,r18,8 ; add ptr, ptr, 8
sub r24,r17,r18 ; sub r24, end, ptr
mov r3,r12 ; mov R0, L0
mov r12,r22 ; mov L0, r22
mov r2,r16 ; mov R1, L1
mov r16,r23 ; mov L1, r23
bgtu loop2



Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ