From: Branimir Maksimovic on
On Fri, 12 Mar 2010 08:14:30 +0100
Branimir Maksimovic <bmaxa(a)hotmail.com> wrote:

I had the wrong count when sorting frequencies. It didn't
show up because it wasn't an issue when there was only one element in a row.
I used SSE2 to reduce pressure on the stack, but there is no
noticeable difference. It could be faster with SSE4, but then
it won't work on anything below Penryn ;)


Greets

Bug fix first:

macro frequencies1
{
local next_slot,next_entry,skip_slot,done
;-----------------------------------------------------------------------
; frequencies1 - walk every bucket of `hashtable` and hand each 16-byte
;                entry to the `find` macro together with `sortedtable`.
;
; Reads:  hashtable.elements  total number of entries still to visit
;         hashtable.data      pointer to an array of dword bucket pointers
; Bucket: dword count, followed by `count` entries of 16 bytes each.
; Clobb:  ebx, ecx, flags, plus whatever `find` clobbers (defined
;         elsewhere - NOTE(review): confirm its clobber list).
;-----------------------------------------------------------------------
        mov     ecx, dword[hashtable.elements]  ; ecx = entries left to visit
        test    ecx, ecx
        jz      done                            ; empty table: nothing to do
        mov     ebx, dword[hashtable.data]      ; ebx -> current bucket slot

next_slot:
        cmp     dword[ebx], 0
        je      skip_slot                       ; null slot: no bucket here
        push    ebx ecx                         ; save slot cursor and running total
        mov     ebx, dword[ebx]                 ; ebx -> bucket
        mov     ecx, dword[ebx]                 ; ecx = entries in this bucket

if 0
        ; debug aid, disabled: print the bucket count and the dword after it
        pusha
        ccall   printf,fmt6,ecx,dword[ebx+4]
        popa
end if

        add     ebx, 4                          ; step over the count to entry 0
        sub     dword[esp], ecx                 ; take the whole bucket off the saved
                                                ; total (a bucket may hold several)
next_entry:
        push    ebx ecx                         ; keep cursor/counter across `find`
        find    dword[sortedtable.data],sortedtable.elements,8,ebx
        pop     ecx ebx
        add     ebx, 16                         ; advance to the next 16-byte entry
        dec     ecx
        jnz     next_entry

        pop     ecx ebx                         ; ecx = remaining total, ebx = slot
        test    ecx, ecx                        ; already reduced above, so no dec here
        jz      done                            ; every entry has been visited
skip_slot:
        add     ebx, 4                          ; move on to the next bucket slot
        jmp     next_slot
done:
}


sse2 version:

macro hash str,size
{
local l1,l2,l3,l4,l5,e1
;-----------------------------------------------------------------------
; hash str,size - pack the bytes at `str` into dwords and collect them
;                 in xmm1.
;
; In:    str  = address of the first input byte
;        size = number of input bytes
; Out:   xmm1 = packed key. Each byte is OR-ed into a 32-bit accumulator
;               after shifting it left by 2; every 16 bytes the
;               accumulator is pushed into xmm1 as its new low dword
;               (older dwords move up one lane).
;               NOTE(review): this is collision-free only if the byte
;               values fit in 2 bits - confirm against the caller.
;        For size < 32 the result is padded so that exactly two dwords
;        have been pushed; for size = 0 the result is all zero.
; Clobb: eax, ebx, ecx, edx, esi, edi, xmm1, xmm2, flags
;-----------------------------------------------------------------------
        mov     ecx, size               ; ecx = bytes remaining
        mov     ebx, str                ; ebx -> next input byte
        xor     eax, eax                ; eax = accumulator for current group
        pxor    xmm1, xmm1              ; start from a defined value so the
                                        ; result depends on the input only
        mov     esi, 16                 ; esi = bytes left in current group
        mov     edi, 2                  ; edi = dwords still owed to xmm1
        test    ecx, ecx
        jz      l3                      ; empty input: skip the loop instead
                                        ; of wrapping ecx around
l1:
        shl     eax, 2
        movzx   edx, byte [ebx]
        or      eax, edx
        inc     ebx
        dec     esi
        jnz     l2
        ; a full group of 16 bytes: push the accumulator into xmm1
        pslldq  xmm1, 4
        movdqa  xmm2, xmm1
        movd    xmm1, eax
        por     xmm1, xmm2
        xor     eax, eax
        mov     esi, 16
        dec     edi                     ; may reach 0 or go negative for
                                        ; inputs of 32 bytes or more
l2:
        dec     ecx
        jnz     l1
l3:
        ; Signed test: with 32+ input bytes edi is already <= 0 here and an
        ; unguarded "dec edi / jnz" would wrap and spin about 2^32 times.
        cmp     edi, 0
        jle     l5
l4:
        ; push the (possibly partial or empty) accumulator until two dwords
        ; in total have been pushed
        pslldq  xmm1, 4
        movdqa  xmm2, xmm1
        movd    xmm1, eax
        por     xmm1, xmm2
        dec     edi
        jnz     l4
        jmp     e1
l5:
        ; two or more dwords already pushed: only a partially filled
        ; trailing group (esi != 16) still has to be pushed
        cmp     esi, 16
        je      e1
        pslldq  xmm1, 4
        movdqa  xmm2, xmm1
        movd    xmm1, eax
        por     xmm1, xmm2
e1:
}

macro hashfind data,elements,block,srchstr,srchlen
{
;-----------------------------------------------------------------------
; hashfind - hash a key and look it up (appending it if absent) in the
;            bucket table at `data`.
;
; In:    data     = base address of the bucket-pointer table
;        elements = address of the dword element counter (passed to strfind)
;        block    = passed through to strfind
;        srchstr  = address of the key bytes
;        srchlen  = number of key bytes
; Out:   eax      = address of the 16-byte entry, as left by strfind
; Clobb: everything `hash` and `strfind` clobber, plus xmm0
;-----------------------------------------------------------------------
        mov     eax, srchstr
        movd    xmm0, eax               ; xmm0 = key pointer; must be set
                                        ; before `hash`, which clobbers eax
        hash    srchstr, srchlen        ; xmm1 = packed key hash
        mov     ebx, data               ; ebx = table base expected by strfind
        strfind elements, block
}

macro strfind elements,block
{
local l1,l2,l3,l4,l5,s1,e1
;-----------------------------------------------------------------------
; strfind elements,block - find the entry whose stored 64-bit hash equals
;                          the low 64 bits of xmm1, appending a new entry
;                          to the bucket when none matches.
;
; In:    ebx  = base address of the bucket-pointer table (dword pointers,
;               slot index in 0..0x1ffff)
;        xmm0 = low dword is stored in a new entry at offset +4
;               (hashfind puts the key pointer there)
;        xmm1 = key hash; dword 1 selects the slot, dwords 0..1 are the
;               value compared and stored
;        elements = address of a dword incremented for every new entry
;        block    = not referenced by this macro
; Out:   eax  = address of the matching or newly appended 16-byte entry
; Bucket layout: dword count, then count entries of 16 bytes:
;        +0 dword (zeroed on creation), +4 dword from xmm0, +8 qword hash
; Uses:  `sema` (dword spin flag), `realloc`, the `ccall` macro and the
;        label `e2` - all defined outside this macro. `e2` is jumped to
;        when realloc returns 0.
; Clobb: eax, ebx, esi, xmm2, xmm3, xmm4, flags, plus the registers a
;        cdecl call may clobber when the allocation path is taken.
;-----------------------------------------------------------------------
        movdqa  xmm2, xmm1
        psrldq  xmm2, 4                 ; pextrd (SSE4.1) would be faster but
                                        ; is avoided for portability
        movd    eax, xmm2               ; eax = dword 1 of the hash
        xor     esi, esi                ; esi = current bucket payload size (0)
        and     eax, 0x1ffff            ; slot index; a larger table was
                                        ; measured to gain about 1 second
        shl     eax, 2                  ; index -> byte offset of the slot
        movd    xmm2, eax               ; xmm2 = slot offset
        movd    xmm3, ebx               ; xmm3 = table base
        cmp     dword[ebx+eax], 0
        jne     l3                      ; bucket exists: search it
l1:
        ; ---- allocate / grow the bucket; esi = count*16 on entry ----
s1:
        mov     ebx, 1
        xor     eax, eax
        lock cmpxchg dword[sema], ebx   ; test and set: 0 -> 1
        and     eax, eax
        jnz     s1                      ; flag was already set: spin
        add     esi, 20                 ; new size = 4 (count) + (count+1)*16
        movd    eax, xmm2
        movd    ebx, xmm3
        ; xmm0-xmm7 are caller-saved in the 32-bit C calling convention, so
        ; realloc may destroy them: keep the live values on the stack.
        sub     esp, 32
        movdqu  [esp], xmm1             ; full hash
        movd    dword[esp+16], xmm0     ; dword to store in the new entry
        mov     dword[esp+20], eax      ; slot offset
        mov     dword[esp+24], ebx      ; table base
        ccall   realloc,dword[ebx+eax],esi ; realloc is not thread safe
        lock and dword[sema], 0         ; release the flag
        movdqu  xmm1, [esp]
        movd    xmm0, dword[esp+16]
        movd    xmm2, dword[esp+20]
        movd    xmm3, dword[esp+24]
        add     esp, 32
        mov     esi, eax                ; esi = (possibly moved) bucket
        and     esi, esi
        jz      e2                      ; realloc returned 0
        movd    eax, xmm2
        movd    ebx, xmm3
        cmp     dword[ebx+eax], 0       ; was the slot empty before? (the mov
        mov     dword[ebx+eax], esi     ; below does not change the flags)
        jne     l2
        mov     esi, dword[ebx+eax]
        mov     dword[esi], 0           ; brand-new bucket: count = 0
l2:
        mov     ebx, dword[ebx+eax]
        add     ebx, 4                  ; ebx -> entry 0
        mov     eax, dword[ebx-4]
        imul    eax, 16                 ; eax = offset of the first free entry
        mov     dword[ebx+eax], 0
        movd    dword[ebx+eax+4], xmm0
        movq    [ebx+eax+8], xmm1
        inc     dword[elements]
        inc     dword[ebx-4]            ; bucket count += 1
        jmp     e1
        ; ---- search the existing bucket ----
l3:
        mov     ebx, dword[ebx+eax]
        add     ebx, 4                  ; ebx -> entry 0
        xor     eax, eax                ; eax = offset of the entry under test

l4:
        mov     esi, dword[ebx-4]
        imul    esi, 16                 ; esi = count*16 = end offset
        cmp     eax, esi
        jge     l1                      ; not found: grow the bucket
        movq    xmm4, [ebx+eax+8]
        pcmpeqd xmm4, xmm1              ; SSE4.1 pcmpeqq would do this in one step

        movd    esi, xmm4
        and     esi, esi
        jz      l5                      ; low dword differs
        psrldq  xmm4, 4
        movd    esi, xmm4
        and     esi, esi
        jnz     e1                      ; both dwords equal: found

l5:
        add     eax, 16
        jmp     l4
e1:
        lea     eax, [ebx+eax]          ; eax = address of the entry
}


--
http://maxa.homedns.org/

Sometimes online sometimes not