Prev: ugly?
Next: comment on my, code, Im little bit rusty , got c++ code finally;) (complete source for interested)
From: Branimir Maksimovic on 17 Mar 2010 02:49
On Fri, 12 Mar 2010 08:14:30 +0100 Branimir Maksimovic <bmaxa(a)hotmail.com> wrote:
I had wrong count when sorting frequencies.
Didn;t show up as when there is only one element in a raw, wasn;t issue.
Used SSE2 to reduce pressure on stack, but there is no noticeable difference.
Could be faster with SSE4 but then wont work on anything bellow penryn ;)
Greets
Bug fix first:

;-----------------------------------------------------------------------
; frequencies1
; Walks every bucket slot of hashtable.data and hands each stored entry
; to the `find` macro together with the sorted table.
; Layout used by the code below:
;   hashtable.data  -> array of dword bucket pointers (0 = empty slot)
;   bucket          -> dword entry count, followed by 16-byte entries
; `find` is defined outside this block; presumably it inserts/looks up
; the entry in sortedtable -- confirm against its definition.
; Uses ebx, ecx and the stack; whatever `find` clobbers is not visible
; here.
;-----------------------------------------------------------------------
macro frequencies1 {
        local l1,l2,l3,e1
        mov ecx,dword[hashtable.elements]       ; ecx = entries still to visit
        cmp ecx,0
        jz e1                                   ; nothing stored: done
        mov ebx,dword[hashtable.data]           ; ebx = current bucket slot
l1:
        cmp dword[ebx],0
        jz l3                                   ; empty slot: advance
        push ebx ecx                            ; save slot pointer, then remaining count ([esp])
        mov ebx,dword[ebx]                      ; ebx = bucket
        mov ecx,dword[ebx]                      ; ecx = number of entries in this bucket
        ; disabled debug dump of the bucket count and the dword at bucket+4
        if 0
        pusha
        ccall printf,fmt6,ecx,dword[ebx+4]
        popa
        end if
        add ebx,4                               ; ebx = first entry of the bucket
        ; Bug fix: deduct the whole bucket count from the saved remaining
        ; counter. Before this, buckets holding more than one entry were
        ; not fully counted.
        sub dword[esp],ecx
l2:
        push ebx ecx                            ; keep entry pointer and loop count across find
        find dword[sortedtable.data],sortedtable.elements,8,ebx
        pop ecx ebx
        add ebx,16                              ; next 16-byte entry
        dec ecx
        jnz l2
        pop ecx ebx                             ; ecx = remaining entries, ebx = bucket slot
        and ecx,ecx                             ; already reduced above, so only test it here
        jz e1                                   ; every stored entry has been visited
l3:
        add ebx,4                               ; next bucket slot
        jmp l1
e1:
}

sse2 version:

;-----------------------------------------------------------------------
; hash str,size
; Packs `size` bytes starting at `str` into the two low dwords of xmm1.
; For each byte eax is shifted left by two bits and the zero-extended
; byte is ORed in (the byte is not masked; presumably the input is
; already 2-bit codes -- confirm). Every 16 bytes eax becomes the new
; low dword of xmm1 and the previous contents move up by one dword.
; Exactly two dwords are emitted (edi counts them); if fewer were
; produced by the main loop, the tail loop emits the current eax for
; each missing one.
; Out:   xmm1 (low 64 bits); upper lanes are whatever was shifted up.
; Clobb: eax, ebx, ecx, edx, esi, edi, xmm2, flags
; NOTE(review): looks like this requires 1 <= size <= 31. With size 0
; the dec ecx / jnz pair wraps; with size >= 32 edi is already 0 or
; negative at l3 and dec edi / jnz wraps -- confirm callers stay in
; range.
;-----------------------------------------------------------------------
macro hash str,size {
        local l1,l2,l3
        mov ecx,size                            ; bytes left
        mov ebx,str                             ; read pointer
        xor eax,eax                             ; accumulator for the current dword
        mov esi,16                              ; bytes left until the accumulator is emitted
        mov edi,2                               ; dwords still to emit
l1:
        shl eax,2
        movzx edx,byte [ebx]
        or eax,edx
        inc ebx
        dec esi
        jnz l2
        ; 16 bytes consumed: xmm1 = (xmm1 << 32) | eax.
        ; movd clears the upper lanes, so the shifted copy is ORed back.
        pslldq xmm1,4
        movdqa xmm2,xmm1
        movd xmm1,eax
        por xmm1,xmm2
        xor eax,eax
        mov esi,16
        dec edi
l2:
        dec ecx
        jnz l1
l3:
        ; tail: emit eax until two dwords have been produced in total
        pslldq xmm1,4
        movdqa xmm2,xmm1
        movd xmm1, eax
        por xmm1,xmm2
        dec edi
        jnz l3
}

;-----------------------------------------------------------------------
; hashfind data,elements,block,srchstr,srchlen
; Puts srchstr into xmm0, hashes srchlen bytes of it into xmm1, loads
; ebx with `data` and runs strfind on that table.
; Result and clobbers are those of hash and strfind.
;-----------------------------------------------------------------------
macro hashfind data,elements,block,srchstr,srchlen {
        mov eax,srchstr
        movd xmm0,eax                           ; xmm0 = string pointer, stored in new entries
        hash srchstr,srchlen                    ; xmm1 = hash
        mov ebx,data                            ; ebx = bucket slot array
        strfind elements,block
}

;-----------------------------------------------------------------------
; strfind elements,block
; Looks up the hash in xmm1 in the table whose bucket slot array is in
; ebx; appends a new entry when it is not found.
; In:    ebx  = bucket slot array, xmm1 = hash (two low dwords),
;        xmm0 = dword stored at entry+4 of a new entry
; Out:   eax  = address of the matching or newly added entry
; Entry layout as written below (16 bytes):
;        +0 dword set to 0 on insert, +4 dword from xmm0,
;        +8 low 64 bits of xmm1
; Bucket layout: dword entry count followed by the entries.
; Clobb: ebx, esi, xmm2, xmm3, xmm4, flags, plus whatever the realloc
;        call clobbers.
; `block` is not referenced anywhere in the body.
; NOTE(review): xmm0..xmm3 are relied on across the realloc call --
; confirm the C library in use leaves them intact.
;-----------------------------------------------------------------------
macro strfind elements,block {
        local l1,l2,l3,l4,l5,s1,e1
        movdqa xmm2,xmm1
        psrldq xmm2,4                           ; select dword 1 of the hash (SSE4.1 pextrd would be faster; author notes it would not work on AMD)
        movd eax,xmm2
        xor esi,esi                             ; esi = current bucket payload size (0 for an empty slot)
        and eax,0x1ffff                         ; slot index mask; author notes the larger table gained about 1 s
        shl eax,2                               ; slot index -> byte offset (dword slots)
        movd xmm2,eax                           ; park slot offset; eax/ebx are reused below
        movd xmm3,ebx                           ; park slot array base
        cmp dword[ebx+eax],0
        jne l3                                  ; bucket exists: search it
l1:
        ; allocate or grow the bucket
s1:
        ; spin until sema changes from 0 to 1 (test and set)
        mov ebx,1
        xor eax,eax
        lock cmpxchg dword[sema],ebx
        and eax,eax
        jnz s1
        add esi,20                              ; + 4 bytes for the count and 16 for one more entry
        movd eax,xmm2
        movd ebx,xmm3
        ccall realloc,dword[ebx+eax],esi        ; realloc is not thread safe, hence the lock
        ; NOTE(review): the lock is released here, before the new pointer
        ; is stored into the slot below -- confirm this is safe if
        ; several threads share one table.
        lock and dword[sema],0                  ; release the lock
        mov esi,eax                             ; esi = new bucket pointer
        and esi,esi
        ; e2 is neither declared local nor defined in this macro; it
        ; must be provided by the surrounding source.
        jz e2                                   ; realloc returned 0
        movd eax,xmm2
        movd ebx,xmm3
        cmp dword[ebx+eax],0                    ; was the slot empty before? (mov below keeps the flags)
        mov dword[ebx+eax],esi                  ; publish the new bucket pointer
        jne l2
        mov esi, dword[ebx+eax]
        mov dword[esi],0                        ; brand-new bucket: entry count = 0
l2:
        ; append an entry at index `count`
        mov ebx,dword[ebx+eax]
        add ebx,4                               ; ebx = first entry
        mov eax,dword[ebx-4]
        imul eax,16                             ; eax = byte offset of the new entry
        mov dword[ebx+eax],0
        movd dword[ebx+eax+4],xmm0
        movq [ebx+eax+8],xmm1
        inc dword[elements]                     ; table-wide entry counter
        inc dword[ebx-4]                        ; bucket entry count
        jmp e1
        ; search the existing bucket
l3:
        mov ebx,dword[ebx+eax]
        add ebx,4                               ; ebx = first entry
        xor eax,eax                             ; eax = byte offset of the entry being checked
l4:
        mov esi,dword[ebx-4]
        imul esi,16                             ; esi = payload size in bytes
        cmp eax,esi
        jge l1                                  ; not found: grow the bucket (esi is used for the new size)
        movq xmm4,[ebx+eax+8]
        pcmpeqd xmm4,xmm1                       ; dword-wise compare; SSE4.1 pcmpeqq could do this in one step
        movd esi,xmm4
        and esi,esi
        jz l5                                   ; low dword differs
        psrldq xmm4,4
        movd esi,xmm4
        and esi,esi
        jnz e1                                  ; both dwords equal: found
l5:
        add eax,16                              ; next entry
        jmp l4
e1:
        lea eax,[ebx+eax]                       ; eax = address of the entry
}

-- http://maxa.homedns.org/ Sometimes online sometimes not |