	.586
	.mmx
	.model	flat
	
	.code
__start:


;	EBP	bit bucket
;	ESI	bit source
;	CL	24 - bits in bucket

; FILLBITS
;
; Tops up the bit bucket in EBP from the byte stream at ESI.  Nothing is
; done when CL is already negative (more than 24 bits buffered);
; otherwise bytes are merged in until CL goes negative.
;
; Every byte read advances ESI by one, except a byte of 0FFh which
; advances it by two, so the byte after an 0FFh is never placed in the
; bucket.
;
; Clobbers: EAX, EDX, flags.

FILLBITS	macro
	local	bucket_full, refill

	test	cl,cl			;sign set => over 24 bits on hand
	js	bucket_full
refill:
	movzx	eax,byte ptr [esi]
	lea	edx,[eax + 101h]	;101h..1FFh, or 200h when byte is 0FFh
	shr	edx,8			;step = 1, or 2 after an 0FFh
	add	esi,edx

	shl	eax,cl			;line the byte up under the buffered bits
	or	ebp,eax
	sub	cl,8			;eight more bits are now buffered
	jns	refill
bucket_full:
	endm

; NEEDBITS at
;
; Refills the bit bucket (EBP) from ESI unless at least 'at' bits are
; already buffered, i.e. unless CL <= 24-at.  Once triggered, bytes are
; merged in until CL goes negative, exactly like the FILLBITS loop.
; A source byte of 0FFh makes ESI advance by two instead of one.
;
; CL is used directly as the shift count, so it must not be negative
; when the refill loop is entered; the compare guarantees that for
; at <= 25.
;
; Clobbers: EAX, EDX, flags.

NEEDBITS	macro at
	local	have_enough, refill

	cmp	cl,24-at
	jle	have_enough
refill:
	movzx	eax,byte ptr [esi]
	lea	edx,[eax + 101h]	;101h..1FFh, or 200h when byte is 0FFh
	shr	edx,8			;step = 1, or 2 after an 0FFh
	add	esi,edx

	shl	eax,cl
	or	ebp,eax
	sub	cl,8
	jns	refill
have_enough:
	endm

; NEEDBITS2 at
;
; Two-bytes-at-a-time refill.  Skipped when CL <= 24-at.  Otherwise the
; bytes at [esi] and [esi+1] are appended to the bucket in EBP, CL drops
; by 16 and ESI advances by 2 (plus whatever sucks2 adds).
;
; Fast path: one dword load from [esi-2] followed by BSWAP leaves
; [esi] in AH and [esi+1] in AL.  If either of them is 0FFh the bytes
; are fetched again by sucks2, which returns the pair in AX and moves
; ESI past the byte following each 0FFh.
;
; The first shift uses CL-8 as its count, so the macro relies on
; CL >= 8 when the refill runs (true for at <= 17).
;
; Clobbers: EAX, flags; EDX as well when sucks2 is called.

NEEDBITS2	macro at
	local	have_enough, merge, has_ff

	cmp	cl,24-at
	jle	have_enough

	mov	eax,[esi-2]		;bytes esi-2 .. esi+1
	bswap	eax			;AH = [esi], AL = [esi+1]
	cmp	al,0ffh
	je	has_ff
	cmp	ah,0ffh
	jne	merge
has_ff:
	call	sucks2
merge:
	movzx	eax,ax			;keep only the two new bytes
	sub	cl,8
	shl	eax,cl			;place them right below the buffered bits
	or	ebp,eax
	sub	cl,8
	add	esi,2
have_enough:
	endm

; ALIGN16
;
; Emits 0..15 NOPs so that the next instruction sits at a multiple of
; 16 bytes from the __start label.  Execution must not fall into the
; padding cheaply: every use in this file follows a jmp or ret.

ALIGN16	macro
	local	padcount
padcount = ((__start - $) and 15)
	rept	padcount
	nop
	endm
	endm

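; C-level view of the exported routine (assembled below as _asm_mb_decode; arguments on the stack, no return value):
;   asm_mb_decode(dword& bitbuf, int& bitcnt, byte *& ptr, int mcu_length, MJPEGBlockDef *pmbd, int **dctptrarray);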

	extern	_zigzag:	dword

; Local frame of _asm_mb_decode, as offsets from ESP after "sub esp,l_size".

l_size		= 32		;bytes reserved for locals

l_mb		= 0		;dword: index of the block being decoded
l_huffac	= 4		;dword: copy of block.huff_ac
l_dctptr	= 8		;dword: output coefficient pointer for this block
l_quantptr	= 12		;dword: copy of block.quant
l_zigzagptr	= 16		;dword: current position inside _zigzag
l_acquick	= 20		;dword: copy of block.huff_ac_quick
l_acquick2	= 24		;dword: copy of block.huff_ac_quick2
l_bitcnt	= 28		;byte:  CL parked here while ECX is reused
l_save		= 29		;not referenced by the code in this file

; Incoming arguments.  Between the locals and the first argument lie the
; seven registers pushed on entry and the return address.

l_pushed	= 7 * 4				;ebp, edi, esi, edx, ecx, ebx, eax
p_args		= l_size + l_pushed + 4		;+4 skips the return address

p_bitbuf	= p_args + 0
p_bitcnt	= p_args + 4
p_ptr		= p_args + 8
p_mb_count	= p_args + 12
p_blocks	= p_args + 16
p_dctptrarray	= p_args + 20

; Per-block descriptor read through the p_blocks argument; the decoder
; steps from one descriptor to the next by "sizeof block" (7 dwords).

block		struct
huff_dc		dword	?	;table walked by DC_decode_loop (code counts at +1, +2, ...)
huff_ac		dword	?	;AC table used when the top 9 bucket bits are all ones
huff_ac_quick	dword	?	;AC table indexed by the top 7 bucket bits
huff_ac_quick2	dword	?	;AC table indexed by the top 12 bucket bits
quant		dword	?	;dword multipliers, indexed by coefficient number
dc_ptr		dword	?	;points at the dword holding the running DC value
ac_last		dword	?	;output: _zigzag byte at the final zigzag position
block		ends

	public _asm_mb_decode

;not referenced anywhere in this file
mmx00FF	dq	0000FFFF0000FFFFh

;-----------------------------------------------------------------------
; _asm_mb_decode
;
; Huffman-decodes [esp+p_mb_count] coefficient blocks from a JPEG-style
; bit stream.
;
; Stack arguments (see the p_* equates):
;	p_bitbuf	pointer to the bit bucket dword      (read, written back)
;	p_bitcnt	pointer to the bit counter            (read, written back)
;	p_ptr		pointer to the source byte pointer    (read, written back)
;	p_mb_count	number of blocks to decode; 0 decodes nothing
;	p_blocks	pointer to an array of 'block' descriptors
;	p_dctptrarray	pointer to an array of output pointers, one per block
;
; For every block:
;	- 32 dwords (128 bytes) at the output pointer are cleared;
;	- the DC code is decoded, multiplied by quant[0], added to the
;	  dword at block.dc_ptr, and its low 16 bits are stored at
;	  output offset 0;
;	- AC coefficients are decoded, multiplied by the quant entry for
;	  their coefficient number and stored as 16-bit values;
;	- block.ac_last receives the _zigzag byte at the final position.
;
; Register roles while decoding:
;	EBP	bit bucket, next bit to consume in bit 31
;	ESI	source byte pointer
;	CL	24 - number of bits in the bucket
;
; All general registers, EAX included, are restored before returning,
; so no value is returned.  Arguments are left on the stack (plain ret).
; "rep stosd" is used, so the direction flag must be clear on entry.
;-----------------------------------------------------------------------

_asm_mb_decode:
	push	ebp
	push	edi
	push	esi
	push	edx
	push	ecx
	push	ebx
	push	eax

	sub	esp,l_size

	mov	dword ptr [esp + l_mb],0
	mov	ebp, [esp + p_bitbuf]
	mov	ecx, [esp + p_bitcnt]
	mov	esi, [esp + p_ptr]
	mov	ebp,[ebp]
	mov	ecx,[ecx]
	mov	esi,[esi]

	;the loop below tests its count at the bottom, so a count of zero
	;has to be turned away here or one block would still be decoded

	cmp	dword ptr [esp + p_mb_count],0
	je	fastexit

mb_loop:
	mov	eax,[esp + p_blocks]

	;cache this block's table pointers in the local frame

	mov	ebx,[eax].block.quant
	mov	[esp + l_quantptr],ebx
	mov	edx,[eax].block.huff_ac_quick
	mov	[esp + l_acquick],edx
	mov	edx,[eax].block.huff_ac_quick2
	mov	[esp + l_acquick2],edx
	mov	ebx,[eax].block.huff_ac
	mov	[esp + l_huffac],ebx

	;fetch this block's output pointer and step the pointer array
	;(EAX and EDX are both dead from here until they are reloaded)

	mov	ebx,[esp + p_dctptrarray]
	mov	edx,[ebx]
	mov	[esp + l_dctptr],edx
	add	ebx,4
	mov	[esp + p_dctptrarray],ebx

	;AC decoding starts at the second zigzag entry; entry 0 is the DC term

	mov	dword ptr [esp + l_zigzagptr], offset _zigzag + 1

	FILLBITS

	;decode DC coefficient

	mov	[esp + l_bitcnt],cl		;ECX is needed as a counter below

	;clear the 64 16-bit output coefficients

	mov	edi,[esp + l_dctptr]
	mov	ecx,32
	xor	eax,eax
	rep	stosd

	;Walk the DC code one length at a time, starting with 2 bits.
	;	EDX = code bits read so far, minus the codes of shorter lengths
	;	EAX = code length - 2
	;	EBX = number of codes shorter than the current length
	;	[edi + eax + 1] = number of codes of the current length

	mov	edx,ebp
	shr	edx,30
	shl	ebp,2
	xor	eax,eax
	xor	ebx,ebx
	mov	edi,[esp + p_blocks]
	mov	edi,[edi].block.huff_dc

DC_decode_loop:
	movzx	ecx,byte ptr [edi + eax + 1]
	sub	edx,ecx
	jc	DC_decode_loop_term		;code belongs to this length
	add	ebp,ebp				;pull one more bit into EDX
	adc	edx,edx
	add	ebx,ecx
	inc	eax
	jmp	short DC_decode_loop

DC_decode_loop_term:
	add	edx,ecx				;undo the failed subtraction
	mov	cl,[esp + l_bitcnt]
	add	cl,al				;account for the eax+2 code bits
	add	cl,2

	;EBX+EDX is the ordinal of the code within the table, and that ordinal
	;is used directly as the number of difference bits that follow.
	;NOTE(review): this presumes symbol N is the Nth code of the DC table;
	;confirm against the code that builds huff_dc.
	;
	;EAX must be zero on the zero-bits path because it is used as the DC
	;difference at no_DC_difference.  MOV leaves the flags of the ADD
	;untouched, so the copy is made before the branch.

	add	ebx,edx
	mov	eax,ebx
	jz	no_DC_difference

	;sign-extend DC difference
	;(leading bit 1: value is the bits as read;
	; leading bit 0: value is the bits with ones shifted in above, plus 1)

	mov	ebx,ebp
	mov	dl,cl				;park the bit counter
	sar	ebx,31
	mov	cl,al				;CL = number of difference bits
	xor	ebx,-1				;EBX = 0 if leading bit set, else -1

	mov	eax,ebx
	shld	eax,ebp,cl
	shl	ebp,cl
	sub	eax,ebx
	add	cl,dl				;bit counter += bits consumed

	;DC difference is now in EAX

no_DC_difference:
	mov	ebx,[esp + p_blocks]
	mov	ebx,[ebx].block.dc_ptr
	mov	edi,[esp + l_quantptr]
	imul	eax,[edi]			;scale by quant[0]
	add	eax,[ebx]			;add to the running DC value
	mov	[ebx],eax
	mov	edx,[esp + l_dctptr]
	mov	[edx+0],al			;store low 16 bits as coefficient 0
	mov	[edx+1],ah

	;***** BEGIN DECODING AC COEFFICIENTS

;	FILLBITS
	jmp	short AC_loop

	ALIGN16
AC_loop:
	NEEDBITS2 16

	;pick a decoder by the leading bits of the bucket

	cmp	ebp,0ff800000h			;top nine bits all ones
	jae	AC_long_decode
	cmp	ebp,0b0000000h
	jae	AC_medium_decode

	;table-based decode for short AC coefficients
	;entry = [signed coefficient value][bits consumed], indexed by the
	;top 7 bits of the bucket; a value of zero ends the block

	mov	ebx,ebp
	mov	edi,[esp + l_acquick]
	shr	ebx,25
	mov	dl,cl
	mov	cl,[edi + ebx*2 + 1]
	movsx	eax,byte ptr [edi + ebx*2]

	shl	ebp,cl
	add	cl,dl
	or	eax,eax
	jz	AC_exit

AC_decode_coefficient:

	;multiply coefficient by quant. matrix entry and store

	mov	ebx,[esp + l_zigzagptr]
	mov	edi,[esp + l_dctptr]
	inc	ebx
	mov	[esp + l_zigzagptr],ebx
	mov	edx,[esp + l_quantptr]
	movzx	ebx,byte ptr [ebx-1]		;coefficient number for this position
	imul	eax,[edx + ebx*4]
	mov	[edi + ebx*2 + 0],al
	mov	[edi + ebx*2 + 1],ah

	;end of AC coefficient loop

	cmp	ebx,63
	jne	AC_loop
	jmp	AC_exit

	ALIGN16
AC_medium_decode:

	;entry = [run/size symbol][code length], indexed by the top 12 bits
	;of the bucket; those start at 0B00h here, hence the -1600h rebase

	mov	ebx,ebp
	mov	edi,[esp + l_acquick2]
	shr	ebx,20
	mov	dl,cl
	mov	cl,[edi + ebx*2 + 1 - 1600h]

	shl	ebp,cl
	add	cl,dl

	; parse out actual code

	NEEDBITS2 16

	movzx	ebx,byte ptr [edi + ebx*2 - 1600h]

	cmp	bl,0f0h
	jz	AC_skip16

AC_do_long:
	mov	eax,ebx
	and	ebx,15			;ebx = size bits

	shr	eax,4			;eax = skip
	mov	edi,[esp + l_zigzagptr]

	add	edi,eax

	;A skip that runs past the 64th zigzag entry cannot come from a valid
	;stream.  Stop the block here, as AC_skip16 does, rather than index
	;beyond _zigzag and store outside the 128-byte coefficient block.

	cmp	edi,offset _zigzag+64
	jae	AC_exit
	mov	[esp + l_zigzagptr],edi

	;read 'size' bits and sign-extend them, same scheme as the DC path

	mov	eax,ebp
	mov	dl,cl
	shr	eax,31
	mov	cl,bl
	dec	eax			;eax = 0 if leading bit set, else -1

	mov	ebx,eax
	shld	eax,ebp,cl
	shl	ebp,cl
	sub	eax,ebx
	add	cl,dl

	jmp	AC_decode_coefficient

	ALIGN16
AC_exit:
	mov	edx,[esp + l_zigzagptr]
	movzx	ebx,byte ptr [edx]

	;all done with this macroblock!

	mov	eax,[esp + p_blocks]
	mov	[eax].block.ac_last,ebx
	add	eax,sizeof block
	mov	[esp + p_blocks],eax

	mov	eax,[esp + l_mb]
	inc	eax
	cmp	eax,[esp + p_mb_count]
	mov	[esp + l_mb],eax
	jb	mb_loop

	;finish
fastexit:
	mov	eax,[esp + p_bitbuf]
	mov	ebx,[esp + p_bitcnt]
	mov	edx,[esp + p_ptr]

	movsx	ecx,cl			;the counter lives in CL only; widen it

	mov	[eax], ebp
	mov	[ebx], ecx
	mov	[edx], esi

	add	esp,l_size
	pop	eax
	pop	ebx
	pop	ecx
	pop	edx
	pop	esi
	pop	edi
	pop	ebp

	ret





	;start long AC decode
	;
	;	16: 00-7F
	;	15: 00-3F
	;	14: 00-1F
	;
	;entry = [run/size symbol][code length], indexed by the top 16 bits
	;of the bucket, which are 0FF80h or above on this path

	ALIGN16
AC_long_decode:
	mov	edx,ebp
	shr	edx,32-16
	mov	edi,[esp + l_huffac]
	movzx	ebx,byte ptr [edi+edx*2-0FF80h*2]
	mov	al,cl
	mov	cl,byte ptr [edi+edx*2-0FF80h*2+1]
	shl	ebp,cl
	add	cl,al

	NEEDBITS2	16

	jmp	AC_do_long

	;Slow path of NEEDBITS2, taken when [esi] or [esi+1] is 0FFh.
	;Reads the two bytes one at a time and moves ESI past the byte that
	;follows each 0FFh.  Returns the pair in AX ([esi] in the high byte);
	;the caller still adds 2 to ESI.  Clobbers EDX and the flags.

	ALIGN16
sucks2:
	movzx	eax,byte ptr [esi]
	mov	edx,1
	add	edx,eax
	shr	edx,8			;1 only when the byte was 0FFh
	add	esi,edx

	shl	eax,8
	movzx	edx,byte ptr [esi+1]

	add	eax,edx
	inc	edx
	shr	edx,8			;1 only when the byte was 0FFh
	add	esi,edx

	ret

	;symbol 0F0h: advance 16 zigzag positions without storing anything
	;(the block was cleared up front); leave the block if that runs off
	;the end of the table

	ALIGN16
AC_skip16:
	mov	edx,[esp + l_zigzagptr]
	add	edx,16
	cmp	edx,offset _zigzag+64
	jae	AC_exit
	mov	[esp+l_zigzagptr],edx
	jmp	AC_loop

	end
