; mvelibwa.asm
;
; Interplay Movie (MVE) File Player Library (32-Bit Win95 Version)
; Assembly Language Components
; Written by Paul Allen Edelstein
;
; (c) 1997 Interplay Productions. All Rights Reserved.
; This file is confidential and consists of proprietary information
; of Interplay Productions. This file and associated libraries
; may not, in whole or in part, be disclosed to third parties,
; incorporated into any software product which is not being created
; for Interplay Productions, copied or duplicated in any form,
; without the prior written permission of Interplay Productions.
; Further, you may not reverse engineer, decompile or otherwise
; attempt to derive source code of this material.
;
;

	.386
	.486	; I only need .386, but I wanted the 486 cycle timings

ifdef SYMANTEC
	.MODEL SMALL, C
DGROUP group _TEXT, _DATA
else
	.MODEL FLAT, C
endif

;;--- Options ---

ONLYNEW	equ 0	; For debug, disables motion comp
LOGGING	equ 0	; Log timing statistics
PARTIAL	equ 1	; Support for partial updates
PKDATA	equ 1	; Support for packed data
HICOLOR	equ 1	; Support for HiColor
INTERP	equ 0	; Interpolated squares
		;  0:none (4x4x8), 1:generic dither,
		;  2:direction dither, 3:blend
COMPOPS	equ 1	; Compressed opcode table
SCALING	equ 1	; Scaling support
DECOMPD	equ 0	; Support for dithered half vert res
TRANS16	equ 1	; Support for translating 16-bit rgb format

;;--- Types ---

PTRBYTE  TYPEDEF PTR BYTE
PTRWORD  TYPEDEF PTR WORD
PTRDWORD TYPEDEF PTR DWORD
PTRPROC  TYPEDEF PTR PROC

;;--- Constants ---

; Width and height of sections in pixels.
SWIDTH	equ 8
SHEIGHT	equ 8
LOG2_SWIDTH	equ 3
LOG2_SHEIGHT	equ 3

;;---

EXTERN pal_tbl:BYTE		; unsigned char pal_tbl[3*256];
EXTERN pal15_tbl:WORD		; unsigned short pal15_tbl[256];
if INTERP eq 3
EXTERN blend_tbl: PTRDWORD	; unsigned *blend_tbl;
endif

.data
BYTE "(c) 1997 Interplay Productions. All Rights Reserved.\n"
BYTE "This file is confidential and consists of proprietary information\n"
BYTE "of Interplay Productions. This file and associated libraries\n"
BYTE "may not, in whole or in part, be disclosed to third parties,\n"
BYTE "incorporated into any software product which is not being created\n"
BYTE "for Interplay Productions, copied or duplicated in any form,\n"
BYTE "without the prior written permission of Interplay Productions.\n"
BYTE "Further, you may not reverse engineer, decompile or otherwise\n"
BYTE "attempt to derive source code of this material.\n",0
.code

PUBLIC mveliba_start, mveliba_end
mveliba_start:

;----------------------------------------------------------------------
; Logging Support
;-----------------

if LOGGING

;void logLabel(char *label)
;
logLabel PROTO lbl:PTRBYTE

LOG_LABEL MACRO msg
	LOCAL lbl
.data
lbl	BYTE msg,0
.code
	INVOKE logLabel, offset lbl
ENDM

else

LOG_LABEL MACRO msg
ENDM

endif

;--------------------------------------------------------------------
; Sound Management
;--------------------

EXTERN snd_8to16: WORD	; short snd_8to16[256];

;unsigned sndDecompM16(unsigned short *dst, unsigned char *src,
;                      unsigned len, unsigned prev);
;
;Decompresses a mono stream containing len samples
;(src is len bytes, dst is len*2 bytes)
;prev is the previous decompression state or zero.
;Returns new decompression state.
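;For reference, a minimal C sketch of the same delta decode (illustrative
;only, not part of the build; the name sndDecompM16_ref is hypothetical and
;it uses the extern short snd_8to16[256] declared above):
;
;   unsigned sndDecompM16_ref(unsigned short *dst, unsigned char *src,
;                             unsigned len, unsigned prev)
;   {
;       unsigned short state = (unsigned short)prev;
;       while (len--) {
;           state += (unsigned short)snd_8to16[*src++]; /* accumulate delta  */
;           *dst++ = state;                             /* emit 16-bit sample */
;       }
;       return state;                                   /* new decomp state  */
;   }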
;
sndDecompM16 PROC USES ESI EDI EBX, \
	dst:PTRWORD, src:PTRBYTE, len:DWORD, prev:DWORD

	mov eax, prev
	mov ecx, len
	jecxz done
	mov esi, src
	mov edi, dst
	xor ebx, ebx

lp:	mov bl, byte ptr [esi]
	add esi, 1
	add ax, word ptr snd_8to16[ebx*2]
	mov word ptr [edi], ax
	add edi, 2
	dec ecx
	jnz lp

done:	ret

sndDecompM16 ENDP

;unsigned sndDecompS16(unsigned short *dst, unsigned char *src,
;                      unsigned len, unsigned prev);
;
;Decompresses a stereo stream containing len samples
;(src is len*2 bytes, dst is len*4 bytes)
;prev is the previous decompression state or zero
; (It encodes the 16-bit states of the two stereo channels
;  in its low and high order 16-bit halves.)
;Returns new decompression state.
;
sndDecompS16 PROC USES ESI EDI EBX, \
	dst:PTRWORD, src:PTRBYTE, len:DWORD, prev:DWORD

	movzx eax, word ptr prev
	movzx edx, word ptr prev+2
	mov ecx, len
	jecxz done
	mov esi, src
	mov edi, dst
	xor ebx, ebx

lp:	mov bl, byte ptr [esi]
	add esi, 1
	add ax, word ptr snd_8to16[ebx*2]
	mov word ptr [edi], ax
	add edi, 2
	mov bl, byte ptr [esi]
	add esi, 1
	add dx, word ptr snd_8to16[ebx*2]
	mov word ptr [edi], dx
	add edi, 2
	dec ecx
	jnz lp

done:	shl edx, 16
	or eax, edx
	ret

sndDecompS16 ENDP

;--------------------------------------------------------------------
; NextFrame (Video Decompression)
;----------------------------------

;; NextFrame working storage
;	MemRec nf_mem_buf1;
;	MemRec nf_mem_buf2;
EXTERN nf_buf_cur: PTRBYTE	; unsigned char* nf_buf_cur;
EXTERN nf_buf_prv: PTRBYTE	; unsigned char* nf_buf_prv;

;; NextFrame parameters
EXTERN nf_wqty: BYTE	;unsigned char nf_wqty;	// (width/SWIDTH)
EXTERN nf_hqty: BYTE	;unsigned char nf_hqty;	// (height/SHEIGHT)
EXTERN nf_fqty: BYTE	;unsigned char nf_fqty;	// Number of fields
if HICOLOR
EXTERN nf_hicolor: DWORD ;unsigned nf_hicolor;	// HiColor (0:none,1:normal,2:swapped)
endif
;;
EXTERN nf_width: DWORD	;unsigned nf_width;	// wqty * SWIDTH
EXTERN nf_height: DWORD	;unsigned nf_height;	// hqty * SHEIGHT;
EXTERN nf_new_line: DWORD ;unsigned nf_new_line;	// width - SWIDTH
EXTERN nf_new_row0: DWORD ;unsigned nf_new_row0;	// SHEIGHT*width*2-width
EXTERN nf_back_right: DWORD ;unsigned nf_back_right;	// (SHEIGHT-1)*width

;; Frame parameters
;; Portion of current frame which has been updated
;; and needs to be sent to screen.
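;; In C terms (an illustrative sketch only, mirroring the computations in
;; NF_DECOMP_INIT below), this dirty rectangle is derived from the x,y,w,h
;; arguments given in SWIDTHxSHEIGHT sections:
;;   nf_new_x = x*SWIDTH*2^HI_COLOR_FLAG;   nf_new_w = w*SWIDTH*2^HI_COLOR_FLAG;
;;   nf_new_y = y*nf_fqty*SHEIGHT;          nf_new_h = h*nf_fqty*SHEIGHT;
;; (the HiColor flag doubles the horizontal byte extent for 2 bytes/pixel).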
;; EXTERN nf_new_x: DWORD ;unsigned nf_new_x; EXTERN nf_new_y: DWORD ;unsigned nf_new_y; EXTERN nf_new_w: DWORD ;unsigned nf_new_w; EXTERN nf_new_h: DWORD ;unsigned nf_new_h; NF_DECOMP_INIT MACRO HI_COLOR_FLAG: REQ mov ax, ds ; Insure es==ds for symantec flat mode mov es, ax mov eax, nf_buf_prv ; DiffBufPtrs = nf_buf_prv - nf_buf_cur sub eax, nf_buf_cur mov DiffBufPtrs, eax xor ebx, ebx ; ebx = nf_fqty (convert to 32-bits) mov bl, nf_fqty mov eax, x ; nf_new_x = x*SWIDTH*2^HI_COLOR_FLAG; shl eax, LOG2_SWIDTH+HI_COLOR_FLAG mov nf_new_x, eax mov eax, w ; nf_new_w = w*SWIDTH*2^HI_COLOR_FLAG; shl eax, LOG2_SWIDTH+HI_COLOR_FLAG mov nf_new_w, eax mov eax, y ; nf_new_y = y*nf_fqty*SHEIGHT; shl eax, LOG2_SHEIGHT mul ebx ;nf_fqty mov nf_new_y, eax mov eax, h ; nf_new_h = h*nf_fqty*SHEIGHT; shl eax, LOG2_SHEIGHT mul ebx ;nf_fqty mov nf_new_h, eax mov eax, nf_new_row0 ; new_row = nf_new_row0 - nf_new_w; sub eax, nf_new_w mov new_row, eax ;; Move to correct place in current buffer mov eax, nf_buf_cur ; tbuf = nf_buf_cur mov tbuf, eax .if x || y ; if (x||y) mov eax, nf_new_y ; tbuf += nf_new_y*nf_width + nf_new_x; mul nf_width add eax, nf_new_x add tbuf, eax .endif ENDM ; DECOMP_INIT DECOMP_BODY MACRO HI_COLOR_FLAG:REQ LOCAL HI_COLOR_SCALE HI_COLOR_SCALE equ HI_COLOR_FLAG+1 NF_DECOMP_INIT HI_COLOR_FLAG mov eax, w ; parms_sz = (w*h*nf_fqty)<<1 mul h mul ebx ;nf_fqty shl eax, 1 mov parms_sz, eax ; esi indexes comp (to get new section data) ; edi indexes current screen buffer ; edx is a frequently used constant ; ebx indexes section params mov edi, tbuf mov edx, nf_new_line ; width - SWIDTH mov ebx, comp ; Parms index mov esi, ebx add esi, parms_sz ; Skip over flags (w*h*2) ; Iterate over params and copy new hires data to appropriate sections. mov cl, nf_fqty ns_0f: push ecx push edi mov ch, byte ptr h ns_0: mov cl, byte ptr w ns_1: cmp word ptr [ebx],0 je ns_10 add edi, SWIDTH*HI_COLOR_SCALE ns_2: add ebx, 2 dec cl jnz ns_1 add edi, new_row ; SHEIGHT*width - SWIDTH*w dec ch jnz ns_0 pop edi pop ecx add edi, nf_width dec cl jnz ns_0f jmp ns_99 ; Copy new data to one section ; Enter with esi pointing to source data, edi to screen section. ; Assumes SWIDTH=8 (16-bit data) and SHEIGHT=8 ns_10: REPEAT 7 REPEAT 2*HI_COLOR_SCALE movsd ENDM add edi, edx ENDM REPEAT 2*HI_COLOR_SCALE movsd ENDM sub edi, nf_back_right ; (SHEIGHT-1)*width jmp ns_2 ns_99: ife ONLYNEW ; if !ONLYNEW ; Iterate over flags and motion source addresses from params ; to determine which sections to move. ; ebx indexes params. ; esi indexes source from buffer ; esi will be computed as +- 16K relative to edi. sub ebx, parms_sz ; Move back to start of section parms mov edi, tbuf mov cl, nf_fqty xor esi, esi ms_0f: push ecx push edi mov ch, byte ptr h ms_0: mov cl, byte ptr w ms_1: or si, [ebx] jg ms_10 jl ms_j30 add edi, SWIDTH*HI_COLOR_SCALE ms_2: add ebx, 2 dec cl jnz ms_1 add edi, new_row ; SHEIGHT*width - SWIDTH*w dec ch jnz ms_0 pop edi pop ecx add edi, nf_width dec cl jnz ms_0f jmp ms_99 ms_j30: jmp ms_30 ; Move one section from current screen to current screen. ; Enter with ; edi pointing to destination screen section, ; relative value of source offset in esi. 
; The following assumes SWIDTH==8 and SHEIGHT==8 ms_10: ; Make esi absolute lea esi, [esi*HI_COLOR_SCALE-04000h*HI_COLOR_SCALE+edi] REPEAT 7 REPEAT 2*HI_COLOR_SCALE movsd ENDM add esi, edx add edi, edx ENDM REPEAT 2*HI_COLOR_SCALE movsd ENDM sub edi, nf_back_right ; (SHEIGHT-1)*width xor esi, esi ; Reset esi to zero jmp ms_2 ms_20f: push ecx push edi mov ch, byte ptr h ms_20: mov cl, byte ptr w ms_21: or si, [ebx] jl ms_30 jg ms_j10 add edi, SWIDTH*HI_COLOR_SCALE ms_22: add ebx, 2 dec cl jnz ms_21 add edi, new_row ; SHEIGHT*width - SWIDTH*w dec ch jnz ms_20 pop edi pop ecx add edi, nf_width dec cl jnz ms_20f jmp ms_99 ms_j10: jmp ms_10 ; Move one section from previous screen to current screen. ; Enter with ; edi pointing to destination screen section, ; relative value of source offset in esi. ; The following assumes SWIDTH==8 and SHEIGHT==8 ms_30: ; Make esi absolute lea esi, [esi*HI_COLOR_SCALE-0C000h*HI_COLOR_SCALE+edi] add esi, DiffBufPtrs ; and point to other buffer REPEAT 7 REPEAT 2*HI_COLOR_SCALE movsd ENDM add esi, edx add edi, edx ENDM REPEAT 2*HI_COLOR_SCALE movsd ENDM sub edi, nf_back_right ; (SHEIGHT-1)*width xor esi, esi ; Reset esi to zero jmp ms_22 ms_99: endif ; #endif !ONLYNEW ENDM ; DECOMP_BODY if PARTIAL DECOMP_CHG_BODY MACRO HI_COLOR_FLAG:REQ LOCAL HI_COLOR_SCALE HI_COLOR_SCALE equ HI_COLOR_FLAG+1 NF_DECOMP_INIT HI_COLOR_FLAG ; esi indexes comp (to get new section data) ; edi indexes current screen buffer ; edx is a frequently used constant ; ebx indexes section params mov edi, tbuf mov edx, nf_new_line ; width - SWIDTH mov esi, comp mov ebx, parms ; Iterate over params and copy new hires data to appropriate sections. mov eax, chgs mov pChgs, eax mov eax, 0 mov cl, nf_fqty ns_0f: push ecx push edi mov ch, byte ptr h ns_0: mov cl, byte ptr w ns_1: add ax, ax ja ns_1b jz ns_5 cmp word ptr [ebx],0 je ns_10 add ebx, 2 ns_1b: add edi, SWIDTH*HI_COLOR_SCALE ns_2: dec cl jnz ns_1 add edi, new_row ; SHEIGHT*width - SWIDTH*w dec ch jnz ns_0 pop edi pop ecx add edi, nf_width dec cl jnz ns_0f jmp ns_99 ns_5: mov eax, pChgs add pChgs, 2 mov ax, [eax] jmp ns_1 ; Copy new data to one section ; Enter with ds:si pointing to source data, es:di to screen section. ; Assumes SWIDTH=8 (16-bit data) and SHEIGHT=8 ns_10: REPEAT 7 REPEAT 2*HI_COLOR_SCALE movsd ENDM add edi, edx ENDM REPEAT 2*HI_COLOR_SCALE movsd ENDM sub edi, nf_back_right ; (SHEIGHT-1)*width add ebx, 2 jmp ns_2 ns_99: ife ONLYNEW ; if !ONLYNEW ; Iterate over flags and motion source addresses from params ; to determine which sections to move. ; ebx indexes params. ; esi indexes source from buffer ; esi will be computed as +- 16K relative to edi. mov edi, tbuf mov ebx, parms mov eax, chgs mov pChgs, eax mov eax, 0 mov cl, byte ptr nf_fqty xor esi, esi ms_0f: push ecx push edi mov ch, byte ptr h ms_0: mov cl, byte ptr w ms_1: add ax, ax ja ms_1b jz ms_5 or si, [ebx] jg ms_10 jl ms_j30 add ebx, 2 ms_1b: add edi, SWIDTH*HI_COLOR_SCALE ms_2: dec cl jnz ms_1 add edi, new_row ; SHEIGHT*width - SWIDTH*w dec ch jnz ms_0 pop edi pop ecx add edi, nf_width dec cl jnz ms_0f jmp ms_99 ms_5: mov eax, pChgs add pChgs, 2 mov ax, word ptr [eax] jmp ms_1 ms_j30: jmp ms_30 ; Move one section from current screen to current screen. ; Enter with ; edi pointing to destination screen section, ; relative value of source offset in esi. 
; The following assumes SWIDTH==8 and SHEIGHT==8 ms_10: ; Make esi absolute lea esi, [esi*HI_COLOR_SCALE-04000h*HI_COLOR_SCALE+edi] REPEAT 7 REPEAT 2*HI_COLOR_SCALE movsd ENDM add esi, edx add edi, edx ENDM REPEAT 2*HI_COLOR_SCALE movsd ENDM sub edi, nf_back_right ; (SHEIGHT-1)*width xor esi, esi ; Reset esi to zero add ebx, 2 jmp ms_2 ms_20f: push ecx push edi mov ch, byte ptr h ms_20: mov cl, byte ptr w ms_21: add ax, ax ja ms_21b jz ms_25 or si, [ebx] jl ms_30 jg ms_j10 add ebx, 2 ms_21b: add edi, SWIDTH*HI_COLOR_SCALE ms_22: dec cl jnz ms_21 add edi, new_row ; SHEIGHT*width - SWIDTH*w dec ch jnz ms_20 pop edi pop ecx add edi, nf_width dec cl jnz ms_20f jmp ms_99 ms_25: mov eax, pChgs add pChgs, 2 mov ax, [eax] jmp ms_21 ms_j10: jmp ms_10 ; Move one section from previous screen to current screen. ; Enter with ; edi pointing to destination screen section, ; relative value of source offset in esi. ; The following assumes SWIDTH==8 and SHEIGHT==8 ms_30: ; Make esi absolute lea esi, [esi*HI_COLOR_SCALE-0C000h*HI_COLOR_SCALE+edi] add esi, DiffBufPtrs ; and point to other buffer REPEAT 7 REPEAT 2*HI_COLOR_SCALE movsd ENDM add esi, edx add edi, edx ENDM REPEAT 2*HI_COLOR_SCALE movsd ENDM sub edi, nf_back_right ; (SHEIGHT-1)*width add ebx, 2 xor esi, esi ; Reset esi to zero jmp ms_22 ms_99: endif ; !ONLYNEW ENDM ; DECOMP_CHG_BODY endif ; PARTIAL ;;--- HiColor versions if HICOLOR ; Decompress into subsection of current buffer specified ; by x,y,w,h in units of SWIDTHxSHEIGHT (8x8). ; ;void ;nfHiColorDecomp(unsigned char *comp, ; unsigned x, unsigned y, unsigned w, unsigned h) ; nfHiColorDecomp PROC USES ESI EDI EBX, \ comp:PTRBYTE, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD LOCAL tbuf: PTRBYTE LOCAL new_row: DWORD LOCAL DiffBufPtrs: DWORD LOCAL parms_sz: DWORD LOG_LABEL "StartHiColorDecomp" DECOMP_BODY 1 ; HiColor LOG_LABEL "EndHiColorDecomp" ret nfHiColorDecomp ENDP if PARTIAL ; Decompress into subsection of current buffer specified ; by x,y,w,h in units of SWIDTHxSHEIGHT (8x8). ; Chgs specifies which squares to update. ; Parms are motion parms for squares to update. ; ;void ;nfHiColorDecompChg(unsigned short *chgs, ; unsigned short *parms, ; unsigned char *comp, ; unsigned x, unsigned y, unsigned w, unsigned h) ; nfHiColorDecompChg PROC USES ESI EDI EBX, \ chgs:PTRWORD, \ parms:PTRWORD, \ comp:PTRBYTE, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD LOCAL tbuf: PTRBYTE LOCAL new_row: DWORD LOCAL DiffBufPtrs: DWORD LOCAL pChgs: PTRBYTE LOG_LABEL "StartHiColorDecompChg" DECOMP_CHG_BODY 1 ; HiColor LOG_LABEL "EndHiColorDecompChg" ret nfHiColorDecompChg ENDP endif ; PARTIAL endif ; HICOLOR ; Non-HiColor versions ; Decompress into subsection of current buffer specified ; by x,y,w,h in units of SWIDTHxSHEIGHT (8x8). ; ;void nfDecomp(unsigned char *comp, ; unsigned x, unsigned y, unsigned w, unsigned h) ; nfDecomp PROC USES ESI EDI EBX, \ comp:PTRBYTE, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD LOCAL tbuf: PTRBYTE LOCAL new_row: DWORD LOCAL DiffBufPtrs: DWORD LOCAL parms_sz: DWORD if HICOLOR .if nf_hicolor INVOKE nfHiColorDecomp, comp,x,y,w,h ret .endif endif LOG_LABEL "StartDecomp" DECOMP_BODY 0 ; Not HiColor LOG_LABEL "EndDecomp" ret nfDecomp ENDP if PARTIAL ; Decompress into subsection of current buffer specified ; by x,y,w,h in units of SWIDTHxSHEIGHT (8x8). ; Chgs specifies which squares to update. ; Parms are motion parms for squares to update. 
; ;void ;nfDecompChg(unsigned short *chgs, ; unsigned short *parms, ; unsigned char *comp, ; unsigned x, unsigned y, unsigned w, unsigned h) ; nfDecompChg PROC USES ESI EDI EBX, \ chgs:PTRWORD, \ parms:PTRWORD, \ comp:PTRBYTE, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD LOCAL tbuf: PTRBYTE LOCAL new_row: DWORD LOCAL DiffBufPtrs: DWORD LOCAL pChgs: PTRBYTE if HICOLOR .if nf_hicolor INVOKE nfHiColorDecompChg, chgs,parms,comp,x,y,w,h ret .endif endif LOG_LABEL "StartDecompChg" DECOMP_CHG_BODY 0 ; Not HiColor LOG_LABEL "EndDecompChg" ret nfDecompChg ENDP endif ; PARTIAL ;---------------------------------------------------------------------- if PKDATA .data if (INTERP eq 1) or (INTERP eq 2) ; *** Old version for dithering *** ; luminace table for palette entries lum_tbl DWORD 256 DUP (0) endif ; signed 8-bit y * nf_width nfpk_ShiftY DWORD 256 DUP (0) ; Constant tables ; 8-bit -8:7 x nf_width + -8:7 nfpk_ShiftP1 LABEL WORD FOR y, <-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7> FOR x, <-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7> BYTE x,y ENDM ENDM ; 8-bit to right and below in roughly 0:14*nf_width + -14:14 (-3 cases) ; negative is ; 8-bit to left and above in roughly -14:0*nf_width + -14:14 (-3 cases) nfpk_ShiftP2 LABEL WORD FOR y, <0,1,2,3,4,5,6,7> FOR x, <8,9,10,11,12,13,14> BYTE x,y ENDM ENDM FOR y, <8,9,10,11,12,13> FOR x, <-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1> BYTE x,y ENDM FOR x, <0,1,2,3,4,5,6,7,8,9,10,11,12,13,14> BYTE x,y ENDM ENDM FOR x, <-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1> BYTE x,14 ENDM FOR x, <0,1,2,3,4,5,6,7,8,9,10,11> BYTE x,14 ENDM nfpk_mov4l LABEL DWORD ; mov ax, bx,cx MOV4L_REGS TEXTEQU > %FOR m4, MOV4L_REGS % FOR m3, MOV4L_REGS % FOR m2, MOV4L_REGS % FOR m1, MOV4L_REGS BYTE m2,m1,m4,m3 ENDM ENDM ENDM ENDM nfpk_mov8 LABEL DWORD ; mov ax, bx/dx/cx/bp MOV8_REGS TEXTEQU > %FOR m4, MOV8_REGS % FOR m3, MOV8_REGS % FOR m2, MOV8_REGS % FOR m1, MOV8_REGS BYTE m2,m1,m4,m3 ENDM ENDM ENDM ENDM nfpk_mov4 LABEL DWORD ; mov al, bl/bh/cl/ch MOV4_REGS0 TEXTEQU > ; mov ah, bl/bh/cl/ch MOV4_REGS1 TEXTEQU > %FOR m4, MOV4_REGS1 % FOR m3, MOV4_REGS0 % FOR m2, MOV4_REGS1 % FOR m1, MOV4_REGS0 BYTE m3,m4,m1,m2 ENDM ENDM ENDM ENDM .code ; nfPkConfig initializes tables used by nfPkDecomp ; which are dependent on screen size. nfPkConfig PROC USES ESI EDI EBX ; Build ShiftY table ; lea edi, nfpk_ShiftY mov ebx, nf_width mov eax, 0 mov ecx, 128 lp1: mov [edi], eax add edi,4 add eax,ebx dec ecx jne lp1 mov eax, ebx shl eax, 7 neg eax mov ecx, 128 lp2: mov [edi], eax add edi,4 add eax,ebx dec ecx jne lp2 ret nfPkConfig ENDP if (INTERP eq 1) or (INTERP eq 2) ; nfPkPal initializes tables used by nfPkDecomp ; which are dependent on palette. 
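;
; Equivalent C sketch of the table build below (illustrative only; uses the
; extern pal_tbl and the lum_tbl defined above, with the integer weights
; taken from the code, i.e. lum ~ (0.2990*r + 0.5866*g + 0.1144*b)*10000):
;
;   for (int i = 0; i < 256; ++i) {
;       unsigned r = pal_tbl[i*3+0], g = pal_tbl[i*3+1], b = pal_tbl[i*3+2];
;       lum_tbl[i] = r*2990 + g*5866 + b*1144;
;   }
;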
nfPkPal PROC USES ESI EDI EBX ; Build palette luminance table ; lea esi, pal_tbl lea edi, lum_tbl mov ecx, 256 lp3: xor eax, eax xor ebx, ebx xor edx, edx mov al, [esi] ; r mov bl, [esi+1] ; g mov dl, [esi+2] ; b add esi, 3 imul eax, 2990 imul ebx, 5866 imul edx, 1144 add eax, ebx add eax, edx mov [edi], eax add edi, 4 dec ecx jnz lp3 ret nfPkPal ENDP elseif INTERP eq 3 nfPkInterp1 MACRO left:REQ, right:REQ xor eax, eax mov al, left mov ah, right mov eax, [esi+eax*4] mov edx, eax mov dl, dh shl edx, 8 mov dl, left mov [edi], edx mov ah, right ror eax, 16 mov [edi+4], eax ENDM nfPkInterp2 MACRO left:REQ, right:REQ xor eax, eax mov al, left mov ah, right mov eax, [esi+eax*4] mov edx, eax mov dl, dh shl edx, 8 mov dl, left mov [edi], edx mov [edi+ebp*1], edx mov ah, right ror eax, 16 mov [edi+4], eax mov [edi+4+ebp*1], eax ENDM endif ifdef SYMANTEC EXTERN _data_bottom:PTRBYTE endif ; Normal version ; nfPkDecomp PROC USES ESI EDI EBX, \ ops:PTRBYTE, comp:PTRBYTE, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD LOCAL tbuf: PTRBYTE LOCAL new_row:DWORD LOCAL DiffBufPtrs:DWORD LOCAL nfpk_back_right: DWORD LOCAL wcnt:DWORD LOG_LABEL "StartPkDecomp" .data nfpk_OpTbl label dword dword offset nf0 ; Prev Same (0) dword offset nf1 ; No change (and copied to screen) (0) dword offset nf2 ; Near shift from older part of current buf (1) dword offset nf3 ; Near shift from newer part of current buf (1) dword offset nf4 ; Near shift from previous buffer (1) dword offset nf5 ; Far shift from previous buffer (2) dword offset nf6 ; Far shift from current buffer (2) ; [Or if COMPOPS, run of no changes (0)] dword offset nf7 ; 8x8x1 (10 bytes) or low 4x4x1 (4 bytes) dword offset nf8 ; 2x2 4x4x1 (16 bytes) or 2x1 4x8x1 (12 bytes) or 1x2 8x4x1 (12 bytes) dword offset nf9 ; 8x8x2 (20 bytes) or low 4x4x2 (8 bytes) or ; low 4x8x2 (12 bytes) or low 8x4x2 (12 bytes) dword offset nf10 ; 2x2 4x4x2 (32 bytes) or 2x1 4x8x2 (24 bytes) or 1x2 4x8x2 (24 bytes) dword offset nf11 ; 8x8x8 (64 bytes) dword offset nf12 ; low 4x4x8 (16 bytes) dword offset nf13 ; 2x2 4x4x0 (ie 2x2x8) (4 bytes) dword offset nf14 ; 8x8x0 (1 byte) dword offset nf15 ; mix 8x8x0 (2 bytes) .code ifdef SYMANTEC mov ebx, ds ; Allow DS to access code mov ecx, 0 mov ax, 3505h int 21h endif NF_DECOMP_INIT 0 mov eax, nf_back_right sub eax, SWIDTH mov nfpk_back_right, eax mov esi, comp mov edi, tbuf nf_StartRow: mov eax, w shr eax, 1 mov wcnt,eax ALIGN 4 nf_NextPair: dec wcnt js nf_NextRow mov ebx, ops mov al, [ebx] inc ebx mov ops, ebx xor ebx, ebx mov bl, al shr bl, 4 and eax, 0Fh push offset nf_NextPair push nfpk_OpTbl[ebx*4] jmp nfpk_OpTbl[eax*4] nf_NextRow: add edi, new_row dec h jnz nf_StartRow LOG_LABEL "EndPkDecomp" ifdef SYMANTEC mov ebx, ds ; Disable DS from accessing code mov ecx, offset DGROUP:_data_bottom[-1] mov ax, 3505h int 21h endif ret ;---------------------------------------- ALIGN 4 if INTERP eq 0 nf0: ; No change from previous buffer mov eax, DiffBufPtrs jmp nf_shift elseif INTERP eq 3 nf0: ; Interpolated (1 byte) push ebp mov ebp, nf_width sub edi, ebp ; Get four corner colors mov bl, [edi-1] ; into bl,bh,cl,ch mov bh, [edi+7] mov cl, [edi+ebp*8-1] mov ch, [esi] inc esi add edi, ebp push esi mov esi, blend_tbl nfPkInterp1 bl,bh add edi, ebp push ebx push ecx xor eax, eax mov al, bl mov ah, cl mov edx, [esi+eax*4] mov al, bh mov ah, ch mov ecx, [esi+eax*4] mov ebx, edx nfPkInterp2 bh,ch lea edi, [edi+ebp*2] ror ebx, 16 ror ecx, 16 nfPkInterp2 bl,cl lea edi, [edi+ebp*2] nfPkInterp2 bh,ch lea edi, [edi+ebp*2] pop ecx pop ebx nfPkInterp1 cl,ch pop esi pop ebp 
sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn elseif INTERP eq 2 nf0: ; Interpolated (1 byte) mov edx, nf_width sub edi, edx ; Get four corner colors sub edi, edx ;xxx mov bl, [edi-1] ; into bl,bh,cl,ch mov bh, [edi+7] mov cl, [edi+edx*8-1] mov ch, [esi] inc esi add edi, edx ;xxx add edi, edx ; Get four luminances into eax, ebx, ebp, ecx ; Use edx for temp, esi for closest luminance, edi for closest pair push ebx push ecx push esi push edi push ebp xor edx, edx mov dl, bl mov eax, lum_tbl[edx*4] mov dl, bh mov ebx, lum_tbl[edx*4] mov dl, cl mov ebp, lum_tbl[edx*4] mov dl, ch mov ecx, lum_tbl[edx*4] mov edx, eax sub edx, ebx jns nf0a neg edx nf0a: mov esi, edx mov edi, 0 ; Vert mov edx, eax sub edx, ebp jns nf0b neg edx nf0b: cmp edx, esi ja nf0c mov esi, edx mov edi, 1 ; Horiz nf0c: mov edx, eax sub edx, ecx jns nf0d neg edx nf0d: cmp edx, esi ja nf0e mov esi, edx mov edi, 2 ; \ Diag nf0e: mov edx, ebx sub edx, ebp jns nf0f neg edx nf0f: cmp edx, esi ja nf0g mov esi, edx ; / RDiag mov edi, 3 nf0g: mov edx, ebx sub edx, ecx jns nf0h neg edx nf0h: cmp edx, esi ja nf0i mov esi, edx mov edi, 1 ; Horiz nf0i: mov edx, ebp sub edx, ecx jns nf0j neg edx nf0j: cmp edx, esi ja nf0k mov edi, 0 nf0k: mov eax, edi pop ebp pop edi pop esi pop ecx pop ebx mov edx, nf_width cmp eax, 2 jae nfdiag or eax, eax jz nf0_v jmp nf0_h nfdiag: jz nf0_d jmp nf0_r if 1 ; Newer versions of Vertical and Horizontal blend that use 0%,25%,50%,75%,100% instead of just 0%,50%,100% ; Vertical blend ; 0 1 ; 01010101 1 ; 00121013 2 ; 02010311 3 ; 20203131 4 ; 02021313 5 ; 23202331 6 ; 20332123 7 ;2 22233233 8 ; nf0_v: ; 3412 (low to high) ;------ mov al, bl ; 0101 (1) mov ah, bh shl eax, 16 mov al, bl mov ah, bh mov [edi], eax mov [edi+4], eax ; 0101 add edi, edx mov al, bh ; 0012 (2) mov ah, cl shl eax, 8 mov al, bl mov ah, bl mov [edi], eax mov al, bh ; 1013 mov ah, ch shl eax, 16 mov al, bh mov ah, bl mov [edi+4], eax add edi, edx mov al, bl ; 0201 (3) mov ah, bh shl eax, 16 mov al, bl mov ah, cl mov [edi], eax mov al, bh ; 0311 mov ah, bh shl eax, 16 mov al, bl mov ah, ch mov [edi+4], eax add edi, edx mov al, cl ; 2020 (4), 0202 (5) mov ah, bl shl eax, 16 mov al, cl mov ah, bl mov [edi], eax ror eax, 8 mov [edi+edx], eax mov al, ch ; 3131, 1313 mov ah, bh shl eax, 16 mov al, ch mov ah, bh mov [edi+4], eax ror eax, 8 mov [edi+edx+4], eax lea edi, [edi+edx*2] mov al, cl ; 2320 (6) mov ah, bl shl eax, 16 mov al, cl mov ah, ch mov [edi], eax mov al, ch ; 2331 mov ah, bh shl eax, 16 mov al, cl mov ah, ch mov [edi+4], eax add edi, edx rol eax, 8 ; 2033 (7) mov al, cl mov ah, bl mov [edi], eax mov al, cl ; 2123 mov ah, ch shl eax, 16 mov al, cl mov ah, bh mov [edi+4], eax add edi, edx mov ah, cl ; 2223 (8) mov [edi], eax mov al, ch ; 3233 mov ah, ch shl eax, 16 mov al, ch mov ah, cl mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ; Horizontal blend ; 0 1 ; 00010111 1 ; 20101301 2 ; 02010131 3 ; 21201033 4 ; 02032113 5 ; 20323321 6 ; 02232313 7 ;2 23223233 8 nf0_h: ; 3412 (low to high) ;------ mov al, bl ; 0001 (1) mov ah, bh shl eax, 16 mov al, bl mov ah, bl mov [edi], eax mov al, bh ; 0111 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx ror eax, 8 ; 2010 (2) mov al, cl mov ah, bl mov [edi], eax rol eax, 8 mov al, bh ; 1301 mov ah, ch mov [edi+4], eax add edi, edx mov al, bl ; 0201 (3) mov ah, cl mov [edi], eax mov al, ch ; 0131 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx mov al, cl ; 2120 (4) mov ah, bl shl eax, 16 mov al, cl mov ah, bh mov [edi], eax mov al, ch ; 1033 mov ah, ch shl eax, 
16 mov al, bh mov ah, bl mov [edi+4], eax add edi, edx rol eax, 8 ; 0203 (5) mov al, bl mov ah, cl mov [edi], eax mov al, bh ; 2113 mov ah, ch shl eax, 16 mov al, cl mov ah, bh mov [edi+4], eax add edi, edx ror eax, 8 ; 2032 (6) mov al, cl mov ah, bl mov [edi], eax mov al, bh ; 3321 mov ah, ch ror eax, 8 mov [edi+4], eax add edi, edx mov al, cl ; 0223 (7) mov ah, ch shl eax, 16 mov al, bl mov ah, cl mov [edi], eax mov al, bh ; 2313 mov ah, ch rol eax, 16 mov [edi+4], eax add edi, edx shl eax, 16 ; 2322 (8) mov al, cl mov ah, ch mov [edi], eax mov al, ch ; 3233 mov ah, ch shl eax, 16 mov al, ch mov ah, cl mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn else ; Vertical blend ;0 1 ; 00101011 1 ; 00010111 2 ; 20203131 3 ; 02021313 4 ; 20203131 5 ; 02021313 6 ; 22323233 7 ;2 22232333 8 ; nf0_v: push ebp ; 3412 (low to high) ;------ mov al, bh ; 0010 (1) mov ah, bl shl eax, 16 mov al, bl mov ah, bl mov [edi], eax mov al, bh ; 1011 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx rol eax, 8 ; 0001 (2) mov al, bl mov ah, bl mov [edi], eax mov al, bh ; 0111 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx mov al, cl ; 2020 (3+5) mov ah, bl shl eax, 16 mov al, cl mov ah, bl mov ebp, eax mov [edi], eax mov [edi+edx*2], eax mov al, ch ; 3131 mov ah, bh shl eax, 16 mov al, ch mov ah, bh mov [edi+4], eax mov [edi+edx*2+4], eax add edi, edx rol ebp, 8 ; 0202 (4+6) mov [edi], ebp mov [edi+edx*2], ebp rol eax, 8 ; 1313 mov [edi+4], eax mov [edi+edx*2+4], eax add edi, edx lea edi, [edi+edx*2] mov al, ch ; 2232 (7) mov ah, cl shl eax, 16 mov al, cl mov ah, cl mov [edi], eax mov al, ch ; 3233 mov ah, ch rol eax, 16 mov [edi+4], eax add edi, edx mov al, cl ; 2223 (8) mov ah, ch shl eax, 16 mov al, cl mov ah, cl mov [edi], eax mov al, ch ; 2333 mov ah, ch rol eax, 16 mov [edi+4], eax pop ebp sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ; Horizontal blend ;0 1 ; 00101011 1 ; 00010111 2 ; 20101031 3 ; 02010113 4 ; 20323231 5 ; 02232313 6 ; 22323233 7 ;2 22232333 8 ; nf0_h: ; 3412 (low to high) ;------ mov al, bh ; 0010 (1) mov ah, bl shl eax, 16 mov al, bl mov ah, bl mov [edi], eax mov al, bh ; 1011 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx rol eax, 8 ; 0001 (2) mov al, bl mov ah, bl mov [edi], eax mov al, bh ; 0111 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx ror eax, 8 ; 2010 (3) mov al, cl mov ah, bl mov [edi], eax mov al, ch ; 1031 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx mov al, bl ; 0201 (4) mov ah, bh rol eax, 16 mov al, bl mov ah, cl mov [edi], eax mov al, bh ; 0113 mov ah, ch rol eax, 16 mov [edi+4], eax add edi, edx mov al, ch ; 2032 (5) mov ah, cl shl eax, 16 mov al, cl mov ah, bl mov [edi], eax mov al, ch ; 3231 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx rol eax, 8 ; 0223 (6) mov al, bl mov ah, cl mov [edi], eax mov al, bh ; 2313 mov ah, ch rol eax, 16 mov [edi+4], eax add edi, edx mov al, ch ; 2232 (7) mov ah, cl shl eax, 16 mov al, cl mov ah, cl mov [edi], eax mov al, ch ; 3233 mov ah, ch rol eax, 16 mov [edi+4], eax add edi, edx mov al, cl ; 2223 (8) mov ah, ch shl eax, 16 mov al, cl mov ah, cl mov [edi], eax mov al, ch ; 2333 mov ah, ch rol eax, 16 mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn endif ; \ Diagonal blend ;0 1 ; 00010101 1 ; 00001313 2 ; 20303101 3 ; 02030313 4 ; 23203031 5 ; 02020333 6 ; 23232333 7 ;2 22023233 8 ; nf0_d: ; 3412 (low to high) ;------ mov al, bl ; 0001 (1) mov ah, bh shl eax, 16 mov al, bl mov ah, bl mov [edi], eax mov ah, bh ; 0101 mov [edi+4], eax add edi, edx mov ah, 
bl ; 0000 (2) rol eax, 16 mov ah, bl mov [edi], eax mov al, bh ; 1313 mov ah, ch shl eax, 16 mov al, bh mov ah, ch mov [edi+4], eax add edi, edx mov al, ch ; 2030 (3) mov ah, bl shl eax, 16 mov al, cl mov ah, bl mov [edi], eax mov al, bl ; 3101 mov ah, bh shl eax, 16 mov al, ch mov ah, bh mov [edi+4], eax add edi, edx mov al, bl ; 0203 (4) mov ah, ch shl eax, 16 mov al, bl mov ah, cl mov [edi], eax mov al, bh ; 0313 mov ah, ch rol eax, 16 mov [edi+4], eax add edi, edx mov al, cl ; 2320 (5) mov ah, bl shl eax, 16 mov al, cl mov ah, ch mov [edi], eax mov al, ch ; 3031 mov ah, bh shl eax, 16 mov al, ch mov ah, bl mov [edi+4], eax add edi, edx mov al, bl ; 0202 (6) mov ah, cl shl eax, 16 mov al, bl mov ah, cl mov [edi], eax mov ah, ch ; 0333 shl eax, 16 mov al, ch mov ah, ch mov [edi+4], eax add edi, edx mov al, cl ; 2323 (7) rol eax, 16 mov al, cl mov [edi], eax mov al, ch ; 2333 rol eax, 16 mov [edi+4], eax add edi, edx mov al, bl ; 2202 (8) mov ah, cl shl eax, 16 mov al, cl mov ah, cl mov [edi], eax mov al, ch ; 3233 mov ah, ch shl eax, 16 mov al, ch mov ah, cl mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ; / RDiagonal blend ;0 1 ; 01010111 1 ; 20201111 2 ; 01021313 3 ; 20212131 4 ; 02121323 5 ; 22213131 6 ; 22232323 7 ;2 22323133 8 ; nf0_r: ; 3412 (low to high) ;------ mov al, bl ; 0101 (1) mov ah, bh shl eax, 16 mov al, bl mov ah, bh mov [edi], eax mov al, bh ; 0111 rol eax, 16 mov [edi+4], eax add edi, edx mov al, cl ; 2020 (2) mov ah, bl shl eax, 16 mov al, cl mov ah, bl mov [edi], eax mov al, bh ; 1111 mov ah, bh shl eax, 16 mov al, bh mov ah, bh mov [edi+4], eax add edi, edx mov al, bl ; 0102 (3) mov ah, cl rol eax, 16 mov al, bl mov [edi], eax mov al, bh ; 1313 mov ah, ch shl eax, 16 mov al, bh mov ah, ch mov [edi+4], eax add edi, edx mov al, cl ; 2021 (4) mov ah, bh shl eax, 16 mov al, cl mov ah, bl mov [edi], eax mov al, ch ; 2131 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx ror eax, 8 ; 0212 (5) mov al, bl mov ah, cl mov [edi], eax mov al, cl ; 1323 mov ah, ch shl eax, 16 mov al, bh mov ah, ch mov [edi+4], eax add edi, edx mov al, cl ; 2221 (6) mov ah, bh shl eax, 16 mov al, cl mov ah, cl mov [edi], eax mov al, ch ; 3131 mov ah, bh shl eax, 16 mov al, ch mov ah, bh mov [edi+4], eax add edi, edx mov al, cl ; 2223 (7) mov ah, ch shl eax, 16 mov al, cl mov ah, cl mov [edi], eax mov ah, ch ; 2323 rol eax, 16 mov [edi+4], eax add edi, edx rol eax, 8 ; 2232 (8) mov al, cl mov [edi], eax mov al, ch ; 3133 mov ah, ch shl eax, 16 mov al, ch mov ah, bh mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn elseif INTERP eq 1 nf0: ; Interpolated (1 byte) mov edx, nf_width sub edi, edx ; Get four corner colors sub edi, edx ;xxx mov bl, [edi-1] ; into bl,bh,cl,ch mov bh, [edi+7] mov cl, [edi+edx*8-1] mov ch, [esi] inc esi add edi, edx ;xxx add edi, edx ; Pattern for interpolating four corners: ;0 1 ; 00101011 1 ; 00010111 2 ; 20023113 3 ; 02101031 4 ; 20323213 5 ; 02201331 6 ; 22232333 7 ;2 22323233 8 ; 3412 (low to high) ;------ nf0_1: mov al, bh ; 0010 mov ah, bl shl eax, 16 mov al, bl mov ah, bl mov [edi], eax mov al, bh ; 1011 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx nf0_2: rol eax, 8 ; 0001 mov al, bl mov ah, bl mov [edi], eax mov al, bh ; 0111 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx nf0_3: mov al, bl ; 2002 mov ah, cl shl eax, 16 mov al, cl mov ah, bl mov [edi], eax mov al, bh ; 3113 mov ah, ch shl eax, 16 mov al, ch mov ah, bh mov [edi+4], eax add edi, edx nf0_4: mov al, bh ; 0210 mov ah, bl shl eax, 16 mov al, bl 
mov ah, cl mov [edi], eax mov al, ch ; 1031 mov ah, bh rol eax, 16 mov [edi+4], eax add edi, edx nf0_5: mov al, cl ; 2032 mov ah, ch shl eax, 16 mov al, cl mov ah, bl mov [edi], eax mov al, bh ; 3213 mov ah, ch rol eax, 16 mov [edi+4], eax add edi, edx nf0_6: mov al, cl ; 0220 mov ah, bl shl eax, 16 mov al, bl mov ah, cl mov [edi], eax mov al, ch ; 1331 mov ah, bh shl eax, 16 mov al, bh mov ah, ch mov [edi+4], eax add edi, edx nf0_7: mov al, cl ; 2223 mov ah, ch shl eax, 16 mov al, cl mov ah, cl mov [edi], eax mov al, ch ; 2333 mov ah, ch rol eax, 16 mov [edi+4], eax add edi, edx nf0_8: ror eax, 8 ; 2232 mov al, cl mov ah, cl mov [edi], eax mov al, ch ; 3233 mov ah, ch rol eax, 16 mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn endif ;---------------------------------------- ALIGN 4 nf1: ; No change (and copied to screen) add edi, SWIDTH retn ;---------------------------------------- ALIGN 4 nf2: ; Near shift from older part of current buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP2[eax*2] nf_xyc_shift: xor ebx, ebx mov bl, ah shl eax, 24 sar eax, 24 add eax, nfpk_ShiftY[ebx*4] jmp nf_shift ;---------------------------------------- ALIGN 4 nf3: ; Near shift from newer part of current buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP2[eax*2] neg al neg ah jmp nf_xyc_shift ;---------------------------------------- ALIGN 4 nf4: ; Near shift from previous buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP1[eax*2] jmp nf_xyp_shift ;---------------------------------------- ALIGN 4 nf5: ; Far shift from previous buffer mov ax, [esi] add esi, 2 nf_xyp_shift: xor ebx, ebx mov bl, ah shl eax, 24 sar eax, 24 add eax, nfpk_ShiftY[ebx*4] add eax, DiffBufPtrs jmp nf_shift ;---------------------------------------- ALIGN 4 if COMPOPS nf6: ; Run of no changes (must only appear in first nibble opcodes) ; Next nibble k specifies 2k+4 squares with no changes add esp, 4 ; Next nibble is not an opcode add ebx, 2 ; (minimum of 4 squares) ALIGN 4 nf6a: add edi, SWIDTH*2 ; Advance over two squares dec ebx jz nf6z ; Last pair of squares dec wcnt ; Same row? 
jns nf6a ; Yes add edi, new_row ; Advance to next row dec h ; Decrement row count (should never become zero here) mov eax, w ; Reset wcnt shr eax ,1 dec eax mov wcnt, eax jmp nf6a nf6z: retn else nf6: ; Far shift from current buffer mov ax, [esi] add esi, 2 jmp nf_xyc_shift endif ;---------------------------------------- ALIGN 4 nf_shift: if 0 ;debug mov eax, 0 mov ebx, eax jmp nf_solid endif mov ebx, esi ; save esi lea esi, [edi+eax] mov edx, nf_width REPEAT 7 mov eax, [esi] mov [edi], eax mov eax, [esi+4] mov [edi+4], eax add esi, edx add edi, edx ENDM mov eax, [esi] mov [edi], eax mov eax, [esi+4] mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 mov esi, ebx ; restore esi retn ;---------------------------------------- ALIGN 4 nf7: ; 8x8x1 (10 bytes) mov ax, [esi] cmp al, ah ja nf23 if 0 ;debug add esi, 10 mov eax, 0fefefefeH mov ebx, eax jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf7_11+2 mov al, [esi+2] mov ebx, [ecx+eax*4] mov [edx+(nf7_11-nf7_11)], bl mov [edx+(nf7_12-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_13-nf7_11)], bl mov [edx+(nf7_14-nf7_11)], bh mov al, [esi+3] mov ebx, [ecx+eax*4] mov [edx+(nf7_21-nf7_11)], bl mov [edx+(nf7_22-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_23-nf7_11)], bl mov [edx+(nf7_24-nf7_11)], bh mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf7_31-nf7_11)], bl mov [edx+(nf7_32-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_33-nf7_11)], bl mov [edx+(nf7_34-nf7_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf7_41-nf7_11)], bl mov [edx+(nf7_42-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_43-nf7_11)], bl mov [edx+(nf7_44-nf7_11)], bh lea edx, [edx+(nf7_51-nf7_11)] mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf7_51-nf7_51)], bl mov [edx+(nf7_52-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_53-nf7_51)], bl mov [edx+(nf7_54-nf7_51)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf7_61-nf7_51)], bl mov [edx+(nf7_62-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_63-nf7_51)], bl mov [edx+(nf7_64-nf7_51)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf7_71-nf7_51)], bl mov [edx+(nf7_72-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_73-nf7_51)], bl mov [edx+(nf7_74-nf7_51)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf7_81-nf7_51)], bl mov [edx+(nf7_82-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_83-nf7_51)], bl mov [edx+(nf7_84-nf7_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). 
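; In C terms the 8x8x1 expansion below is roughly (illustrative sketch only,
; ignoring the self-modifying dispatch; "dst" stands for the edi destination):
;   /* src[0],src[1] = the two colors; src[2..9] = one pattern byte per row */
;   for (row = 0; row < 8; row++) {
;       bits = src[2 + row];
;       for (col = 0; col < 8; col++)
;           dst[row*nf_width + col] = src[(bits >> col) & 1];
;   }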
mov cx, [esi] mov esi,nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf7_0 ; flush prefetch ALIGN 4 nf7_0: nf7_11: mov ax, bx shl eax, 16 nf7_12: mov ax, bx mov [edi], eax nf7_13: mov ax, bx shl eax, 16 nf7_14: mov ax, bx mov [edi+4], eax add edi, esi nf7_21: mov ax, bx shl eax, 16 nf7_22: mov ax, bx mov [edi], eax nf7_23: mov ax, bx shl eax, 16 nf7_24: mov ax, bx mov [edi+4], eax add edi, esi nf7_31: mov ax, bx shl eax, 16 nf7_32: mov ax, bx mov [edi], eax nf7_33: mov ax, bx shl eax, 16 nf7_34: mov ax, bx mov [edi+4], eax add edi, esi nf7_41: mov ax, bx shl eax, 16 nf7_42: mov ax, bx mov [edi], eax nf7_43: mov ax, bx shl eax, 16 nf7_44: mov ax, bx mov [edi+4], eax add edi, esi nf7_51: mov ax, bx shl eax, 16 nf7_52: mov ax, bx mov [edi], eax nf7_53: mov ax, bx shl eax, 16 nf7_54: mov ax, bx mov [edi+4], eax add edi, esi nf7_61: mov ax, bx shl eax, 16 nf7_62: mov ax, bx mov [edi], eax nf7_63: mov ax, bx shl eax, 16 nf7_64: mov ax, bx mov [edi+4], eax add edi, esi nf7_71: mov ax, bx shl eax, 16 nf7_72: mov ax, bx mov [edi], eax nf7_73: mov ax, bx shl eax, 16 nf7_74: mov ax, bx mov [edi+4], eax add edi, esi nf7_81: mov ax, bx shl eax, 16 nf7_82: mov ax, bx mov [edi], eax nf7_83: mov ax, bx shl eax, 16 nf7_84: mov ax, bx mov [edi+4], eax pop esi pop ebp add esi, 10 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf7+16 nf23: ; low 4x4x1 (4 bytes) xor eax, eax lea ecx, nfpk_mov4l lea edx, byte ptr ds:nf23_11+2 mov al, [esi+2] and al, 0fH mov ebx, [ecx+eax*4] mov [edx+(nf23_11-nf23_11)], bl mov [edx+(nf23_12-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_13-nf23_11)], bl mov [edx+(nf23_14-nf23_11)], bh mov al, [esi+2] shr al, 4 mov ebx, [ecx+eax*4] mov [edx+(nf23_31-nf23_11)], bl mov [edx+(nf23_32-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_33-nf23_11)], bl mov [edx+(nf23_34-nf23_11)], bh mov al, [esi+3] and al, 0fH mov ebx, [ecx+eax*4] mov [edx+(nf23_51-nf23_11)], bl mov [edx+(nf23_52-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_53-nf23_11)], bl mov [edx+(nf23_54-nf23_11)], bh mov al, [esi+3] shr al, 4 mov ebx, [ecx+eax*4] mov [edx+(nf23_71-nf23_11)], bl mov [edx+(nf23_72-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_73-nf23_11)], bl mov [edx+(nf23_74-nf23_11)], bh mov edx, nf_width ; load bx,cx with 00,11 color combinations mov bx, [esi] mov cl, bh mov bh, bl mov ch, cl jmp nf23_0 ; flush prefetch ALIGN 4 nf23_0: nf23_11:mov ax, bx shl eax, 16 nf23_12:mov ax, bx mov [edi], eax mov [edi+edx], eax nf23_13:mov ax, bx shl eax, 16 nf23_14:mov ax, bx mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf23_31:mov ax, bx shl eax, 16 nf23_32:mov ax, bx mov [edi], eax mov [edi+edx], eax nf23_33:mov ax, bx shl eax, 16 nf23_34:mov ax, bx mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf23_51:mov ax, bx shl eax, 16 nf23_52:mov ax, bx mov [edi], eax mov [edi+edx], eax nf23_53:mov ax, bx shl eax, 16 nf23_54:mov ax, bx mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf23_71:mov ax, bx shl eax, 16 nf23_72:mov ax, bx mov [edi], eax mov [edi+edx], eax nf23_73:mov ax, bx shl eax, 16 nf23_74:mov ax, bx mov [edi+4], eax add edi, edx mov [edi+4], eax sub edi, nfpk_back_right add esi, 4 retn ;---------------------------------------- ALIGN 4 nf8: ; 2x2 4x4x1 (16 bytes) mov ax, [esi] cmp al, ah ja nf24 xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf8_11+2 mov al, [esi+2] mov ebx, [ecx+eax*4] mov [edx+(nf8_11-nf8_11)], bl mov [edx+(nf8_12-nf8_11)], bh shr ebx, 16 mov [edx+(nf8_13-nf8_11)], bl mov 
[edx+(nf8_14-nf8_11)], bh mov al, [esi+3] mov ebx, [ecx+eax*4] mov [edx+(nf8_21-nf8_11)], bl mov [edx+(nf8_22-nf8_11)], bh shr ebx, 16 mov [edx+(nf8_23-nf8_11)], bl mov [edx+(nf8_24-nf8_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf8_31-nf8_11)], bl mov [edx+(nf8_32-nf8_11)], bh shr ebx, 16 mov [edx+(nf8_33-nf8_11)], bl mov [edx+(nf8_34-nf8_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf8_41-nf8_11)], bl mov [edx+(nf8_42-nf8_11)], bh shr ebx, 16 mov [edx+(nf8_43-nf8_11)], bl mov [edx+(nf8_44-nf8_11)], bh add edx, nf8_51-nf8_11 mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf8_51-nf8_51)], bl mov [edx+(nf8_52-nf8_51)], bh shr ebx, 16 mov [edx+(nf8_53-nf8_51)], bl mov [edx+(nf8_54-nf8_51)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf8_61-nf8_51)], bl mov [edx+(nf8_62-nf8_51)], bh shr ebx, 16 mov [edx+(nf8_63-nf8_51)], bl mov [edx+(nf8_64-nf8_51)], bh mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf8_71-nf8_51)], bl mov [edx+(nf8_72-nf8_51)], bh shr ebx, 16 mov [edx+(nf8_73-nf8_51)], bl mov [edx+(nf8_74-nf8_51)], bh mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf8_81-nf8_51)], bl mov [edx+(nf8_82-nf8_51)], bh shr ebx, 16 mov [edx+(nf8_83-nf8_51)], bl mov [edx+(nf8_84-nf8_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf8_0 ; flush prefetch ALIGN 4 nf8_0: nf8_11: mov ax, bx shl eax, 16 nf8_12: mov ax, bx mov [edi], eax add edi, esi nf8_13: mov ax, bx shl eax, 16 nf8_14: mov ax, bx mov [edi], eax add edi, esi nf8_21: mov ax, bx shl eax, 16 nf8_22: mov ax, bx mov [edi], eax add edi, esi nf8_23: mov ax, bx shl eax, 16 nf8_24: mov ax, bx mov [edi], eax add edi, esi mov eax, [esp] mov cx, [eax+4] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_31: mov ax, bx shl eax, 16 nf8_32: mov ax, bx mov [edi], eax add edi, esi nf8_33: mov ax, bx shl eax, 16 nf8_34: mov ax, bx mov [edi], eax add edi, esi nf8_41: mov ax, bx shl eax, 16 nf8_42: mov ax, bx mov [edi], eax add edi, esi nf8_43: mov ax, bx shl eax, 16 nf8_44: mov ax, bx mov [edi], eax add edi, esi lea eax, [esi*8-4] sub edi, eax mov eax, [esp] mov cx, [eax+8] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_51: mov ax, bx shl eax, 16 nf8_52: mov ax, bx mov [edi], eax add edi, esi nf8_53: mov ax, bx shl eax, 16 nf8_54: mov ax, bx mov [edi], eax add edi, esi nf8_61: mov ax, bx shl eax, 16 nf8_62: mov ax, bx mov [edi], eax add edi, esi nf8_63: mov ax, bx shl eax, 16 nf8_64: mov ax, bx mov [edi], eax add edi, esi mov eax, [esp] mov cx, [eax+12] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_71: mov ax, bx shl eax, 16 nf8_72: mov ax, bx mov [edi], eax add edi, esi nf8_73: mov ax, bx shl eax, 16 nf8_74: mov ax, bx mov [edi], eax add edi, esi nf8_81: mov ax, bx shl eax, 16 nf8_82: mov ax, bx mov [edi], eax add edi, esi nf8_83: mov ax, bx shl eax, 16 nf8_84: mov ax, bx mov [edi], eax pop esi pop ebp add esi, 16 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf8+16 nf24: ; 2x1 4x8x1 (12 bytes) mov ax, [esi+6] cmp al, ah ja nf40 xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf24_11+2 mov al, [esi+2] mov ebx, [ecx+eax*4] mov [edx+(nf24_11-nf24_11)], bl mov [edx+(nf24_12-nf24_11)], bh shr ebx, 16 mov [edx+(nf24_13-nf24_11)], bl mov [edx+(nf24_14-nf24_11)], bh mov al, [esi+3] 
mov ebx, [ecx+eax*4] mov [edx+(nf24_21-nf24_11)], bl mov [edx+(nf24_22-nf24_11)], bh shr ebx, 16 mov [edx+(nf24_23-nf24_11)], bl mov [edx+(nf24_24-nf24_11)], bh mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf24_31-nf24_11)], bl mov [edx+(nf24_32-nf24_11)], bh shr ebx, 16 mov [edx+(nf24_33-nf24_11)], bl mov [edx+(nf24_34-nf24_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf24_41-nf24_11)], bl mov [edx+(nf24_42-nf24_11)], bh shr ebx, 16 mov [edx+(nf24_43-nf24_11)], bl mov [edx+(nf24_44-nf24_11)], bh add edx, nf24_51-nf24_11 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf24_51-nf24_51)], bl mov [edx+(nf24_52-nf24_51)], bh shr ebx, 16 mov [edx+(nf24_53-nf24_51)], bl mov [edx+(nf24_54-nf24_51)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf24_61-nf24_51)], bl mov [edx+(nf24_62-nf24_51)], bh shr ebx, 16 mov [edx+(nf24_63-nf24_51)], bl mov [edx+(nf24_64-nf24_51)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf24_71-nf24_51)], bl mov [edx+(nf24_72-nf24_51)], bh shr ebx, 16 mov [edx+(nf24_73-nf24_51)], bl mov [edx+(nf24_74-nf24_51)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf24_81-nf24_51)], bl mov [edx+(nf24_82-nf24_51)], bh shr ebx, 16 mov [edx+(nf24_83-nf24_51)], bl mov [edx+(nf24_84-nf24_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf24_0 ; flush prefetch ALIGN 4 nf24_0: nf24_11:mov ax, bx shl eax, 16 nf24_12:mov ax, bx mov [edi], eax add edi, esi nf24_13:mov ax, bx shl eax, 16 nf24_14:mov ax, bx mov [edi], eax add edi, esi nf24_21:mov ax, bx shl eax, 16 nf24_22:mov ax, bx mov [edi], eax add edi, esi nf24_23:mov ax, bx shl eax, 16 nf24_24:mov ax, bx mov [edi], eax add edi, esi nf24_31:mov ax, bx shl eax, 16 nf24_32:mov ax, bx mov [edi], eax add edi, esi nf24_33:mov ax, bx shl eax, 16 nf24_34:mov ax, bx mov [edi], eax add edi, esi nf24_41:mov ax, bx shl eax, 16 nf24_42:mov ax, bx mov [edi], eax add edi, esi nf24_43:mov ax, bx shl eax, 16 nf24_44:mov ax, bx mov [edi], eax add edi, esi lea eax, [esi*8-4] sub edi, eax mov eax, [esp] mov cx, [eax+6] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf24_51:mov ax, bx shl eax, 16 nf24_52:mov ax, bx mov [edi], eax add edi, esi nf24_53:mov ax, bx shl eax, 16 nf24_54:mov ax, bx mov [edi], eax add edi, esi nf24_61:mov ax, bx shl eax, 16 nf24_62:mov ax, bx mov [edi], eax add edi, esi nf24_63:mov ax, bx shl eax, 16 nf24_64:mov ax, bx mov [edi], eax add edi, esi nf24_71:mov ax, bx shl eax, 16 nf24_72:mov ax, bx mov [edi], eax add edi, esi nf24_73:mov ax, bx shl eax, 16 nf24_74:mov ax, bx mov [edi], eax add edi, esi nf24_81:mov ax, bx shl eax, 16 nf24_82:mov ax, bx mov [edi], eax add edi, esi nf24_83:mov ax, bx shl eax, 16 nf24_84:mov ax, bx mov [edi], eax pop esi pop ebp add esi, 12 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf8+32 nf40: ; 1x2 8x4x1 (12 bytes) xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf40_11+2 mov al, [esi+2] mov ebx, [ecx+eax*4] mov [edx+(nf40_11-nf40_11)], bl mov [edx+(nf40_12-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_13-nf40_11)], bl mov [edx+(nf40_14-nf40_11)], bh mov al, [esi+3] mov ebx, [ecx+eax*4] mov [edx+(nf40_21-nf40_11)], bl mov [edx+(nf40_22-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_23-nf40_11)], bl mov [edx+(nf40_24-nf40_11)], bh mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf40_31-nf40_11)], 
bl mov [edx+(nf40_32-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_33-nf40_11)], bl mov [edx+(nf40_34-nf40_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf40_41-nf40_11)], bl mov [edx+(nf40_42-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_43-nf40_11)], bl mov [edx+(nf40_44-nf40_11)], bh add edx, nf40_51-nf40_11 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf40_51-nf40_51)], bl mov [edx+(nf40_52-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_53-nf40_51)], bl mov [edx+(nf40_54-nf40_51)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf40_61-nf40_51)], bl mov [edx+(nf40_62-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_63-nf40_51)], bl mov [edx+(nf40_64-nf40_51)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf40_71-nf40_51)], bl mov [edx+(nf40_72-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_73-nf40_51)], bl mov [edx+(nf40_74-nf40_51)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf40_81-nf40_51)], bl mov [edx+(nf40_82-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_83-nf40_51)], bl mov [edx+(nf40_84-nf40_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf40_0 ; flush prefetch ALIGN 4 nf40_0: nf40_11:mov ax, bx shl eax, 16 nf40_12:mov ax, bx mov [edi], eax nf40_13:mov ax, bx shl eax, 16 nf40_14:mov ax, bx mov [edi+4], eax add edi, esi nf40_21:mov ax, bx shl eax, 16 nf40_22:mov ax, bx mov [edi], eax nf40_23:mov ax, bx shl eax, 16 nf40_24:mov ax, bx mov [edi+4], eax add edi, esi nf40_31:mov ax, bx shl eax, 16 nf40_32:mov ax, bx mov [edi], eax nf40_33:mov ax, bx shl eax, 16 nf40_34:mov ax, bx mov [edi+4], eax add edi, esi nf40_41:mov ax, bx shl eax, 16 nf40_42:mov ax, bx mov [edi], eax nf40_43:mov ax, bx shl eax, 16 nf40_44:mov ax, bx mov [edi+4], eax add edi, esi mov eax, [esp] mov cx, [eax+6] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf40_51:mov ax, bx shl eax, 16 nf40_52:mov ax, bx mov [edi], eax nf40_53:mov ax, bx shl eax, 16 nf40_54:mov ax, bx mov [edi+4], eax add edi, esi nf40_61:mov ax, bx shl eax, 16 nf40_62:mov ax, bx mov [edi], eax nf40_63:mov ax, bx shl eax, 16 nf40_64:mov ax, bx mov [edi+4], eax add edi, esi nf40_71:mov ax, bx shl eax, 16 nf40_72:mov ax, bx mov [edi], eax nf40_73:mov ax, bx shl eax, 16 nf40_74:mov ax, bx mov [edi+4], eax add edi, esi nf40_81:mov ax, bx shl eax, 16 nf40_82:mov ax, bx mov [edi], eax nf40_83:mov ax, bx shl eax, 16 nf40_84:mov ax, bx mov [edi+4], eax pop esi pop ebp add esi, 12 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf9: ; 8x8x2 (20 bytes) mov eax, [esi] cmp al, ah ja nf41 shr eax, 16 cmp al, ah ja nf25 xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf9_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf9_11-nf9_11)], bl mov [edx+(nf9_12-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_13-nf9_11)], bl mov [edx+(nf9_14-nf9_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf9_15-nf9_11)], bl mov [edx+(nf9_16-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_17-nf9_11)], bl mov [edx+(nf9_18-nf9_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf9_21-nf9_11)], bl mov [edx+(nf9_22-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_23-nf9_11)], bl mov [edx+(nf9_24-nf9_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf9_25-nf9_11)], bl mov [edx+(nf9_26-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_27-nf9_11)], bl mov [edx+(nf9_28-nf9_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov 
[edx+(nf9_31-nf9_11)], bl mov [edx+(nf9_32-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_33-nf9_11)], bl mov [edx+(nf9_34-nf9_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf9_35-nf9_11)], bl mov [edx+(nf9_36-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_37-nf9_11)], bl mov [edx+(nf9_38-nf9_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf9_41-nf9_11)], bl mov [edx+(nf9_42-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_43-nf9_11)], bl mov [edx+(nf9_44-nf9_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf9_45-nf9_11)], bl mov [edx+(nf9_46-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_47-nf9_11)], bl mov [edx+(nf9_48-nf9_11)], bh lea edx, [edx+(nf9_51-nf9_11)] mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf9_51-nf9_51)], bl mov [edx+(nf9_52-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_53-nf9_51)], bl mov [edx+(nf9_54-nf9_51)], bh mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf9_55-nf9_51)], bl mov [edx+(nf9_56-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_57-nf9_51)], bl mov [edx+(nf9_58-nf9_51)], bh mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf9_61-nf9_51)], bl mov [edx+(nf9_62-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_63-nf9_51)], bl mov [edx+(nf9_64-nf9_51)], bh mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf9_65-nf9_51)], bl mov [edx+(nf9_66-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_67-nf9_51)], bl mov [edx+(nf9_68-nf9_51)], bh mov al, [esi+16] mov ebx, [ecx+eax*4] mov [edx+(nf9_71-nf9_51)], bl mov [edx+(nf9_72-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_73-nf9_51)], bl mov [edx+(nf9_74-nf9_51)], bh mov al, [esi+17] mov ebx, [ecx+eax*4] mov [edx+(nf9_75-nf9_51)], bl mov [edx+(nf9_76-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_77-nf9_51)], bl mov [edx+(nf9_78-nf9_51)], bh mov al, [esi+18] mov ebx, [ecx+eax*4] mov [edx+(nf9_81-nf9_51)], bl mov [edx+(nf9_82-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_83-nf9_51)], bl mov [edx+(nf9_84-nf9_51)], bh mov al, [esi+19] mov ebx, [ecx+eax*4] mov [edx+(nf9_85-nf9_51)], bl mov [edx+(nf9_86-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_87-nf9_51)], bl mov [edx+(nf9_88-nf9_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf9_0 ; flush prefetch ALIGN 4 nf9_0: nf9_11: mov al, bl nf9_12: mov ah, bl shl eax, 16 nf9_13: mov al, bl nf9_14: mov ah, bl mov [edi], eax nf9_15: mov al, bl nf9_16: mov ah, bl shl eax, 16 nf9_17: mov al, bl nf9_18: mov ah, bl mov [edi+4], eax add edi, edx nf9_21: mov al, bl nf9_22: mov ah, bl shl eax, 16 nf9_23: mov al, bl nf9_24: mov ah, bl mov [edi], eax nf9_25: mov al, bl nf9_26: mov ah, bl shl eax, 16 nf9_27: mov al, bl nf9_28: mov ah, bl mov [edi+4], eax add edi, edx nf9_31: mov al, bl nf9_32: mov ah, bl shl eax, 16 nf9_33: mov al, bl nf9_34: mov ah, bl mov [edi], eax nf9_35: mov al, bl nf9_36: mov ah, bl shl eax, 16 nf9_37: mov al, bl nf9_38: mov ah, bl mov [edi+4], eax add edi, edx nf9_41: mov al, bl nf9_42: mov ah, bl shl eax, 16 nf9_43: mov al, bl nf9_44: mov ah, bl mov [edi], eax nf9_45: mov al, bl nf9_46: mov ah, bl shl eax, 16 nf9_47: mov al, bl nf9_48: mov ah, bl mov [edi+4], eax add edi, edx nf9_51: mov al, bl nf9_52: mov ah, bl shl eax, 16 nf9_53: mov al, bl nf9_54: mov ah, bl mov [edi], eax nf9_55: mov al, bl nf9_56: mov ah, bl shl eax, 16 nf9_57: mov al, bl nf9_58: mov ah, bl mov [edi+4], eax add edi, edx nf9_61: mov al, bl nf9_62: mov ah, bl shl eax, 16 nf9_63: mov al, bl nf9_64: mov ah, bl mov [edi], eax nf9_65: mov al, bl nf9_66: mov ah, bl shl eax, 16 nf9_67: mov al, bl nf9_68: mov ah, bl mov [edi+4], eax add edi, edx nf9_71: mov al, bl nf9_72: mov ah, bl shl eax, 16 nf9_73: mov al, bl 
nf9_74: mov ah, bl mov [edi], eax nf9_75: mov al, bl nf9_76: mov ah, bl shl eax, 16 nf9_77: mov al, bl nf9_78: mov ah, bl mov [edi+4], eax add edi, edx nf9_81: mov al, bl nf9_82: mov ah, bl shl eax, 16 nf9_83: mov al, bl nf9_84: mov ah, bl mov [edi], eax nf9_85: mov al, bl nf9_86: mov ah, bl shl eax, 16 nf9_87: mov al, bl nf9_88: mov ah, bl mov [edi+4], eax add esi, 20 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 ;nf9+16 nf25: ; low 4x4x2 (8 bytes) if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 8 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf25_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf25_14-nf25_11)], bl mov [edx+(nf25_13-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_12-nf25_11)], bl mov [edx+(nf25_11-nf25_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf25_24-nf25_11)], bl mov [edx+(nf25_23-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_22-nf25_11)], bl mov [edx+(nf25_21-nf25_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf25_34-nf25_11)], bl mov [edx+(nf25_33-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_32-nf25_11)], bl mov [edx+(nf25_31-nf25_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf25_44-nf25_11)], bl mov [edx+(nf25_43-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_42-nf25_11)], bl mov [edx+(nf25_41-nf25_11)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf25_0 ; flush prefetch ALIGN 4 nf25_0: nf25_11:mov ah, bl mov al, ah shl eax, 16 nf25_12:mov al, bl mov ah, al mov [edi], eax mov [edi+edx], eax nf25_13:mov ah, bl mov al, ah shl eax, 16 nf25_14:mov al, bl mov ah, al mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf25_21:mov ah, bl mov al, ah shl eax, 16 nf25_22:mov al, bl mov ah, al mov [edi], eax mov [edi+edx], eax nf25_23:mov ah, bl mov al, ah shl eax, 16 nf25_24:mov al, bl mov ah, al mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf25_31:mov ah, bl mov al, ah shl eax, 16 nf25_32:mov al, bl mov ah, al mov [edi], eax mov [edi+edx], eax nf25_33:mov ah, bl mov al, ah shl eax, 16 nf25_34:mov al, bl mov ah, al mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf25_41:mov ah, bl mov al, ah shl eax, 16 nf25_42:mov al, bl mov ah, al mov [edi], eax mov [edi+edx], eax nf25_43:mov ah, bl mov al, ah shl eax, 16 nf25_44:mov al, bl mov ah, al mov [edi+4], eax mov [edi+edx+4], eax add edi, edx add esi, 8 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf9+32 nf41: ; low 4x8x2 (12 bytes) shr eax, 16 cmp al, ah ja nf57 xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf41_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf41_14-nf41_11)], bl mov [edx+(nf41_13-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_12-nf41_11)], bl mov [edx+(nf41_11-nf41_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf41_24-nf41_11)], bl mov [edx+(nf41_23-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_22-nf41_11)], bl mov [edx+(nf41_21-nf41_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf41_34-nf41_11)], bl mov [edx+(nf41_33-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_32-nf41_11)], bl mov [edx+(nf41_31-nf41_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf41_44-nf41_11)], bl mov [edx+(nf41_43-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_42-nf41_11)], bl mov [edx+(nf41_41-nf41_11)], bh lea edx, [edx+(nf41_51-nf41_11)] mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf41_54-nf41_51)], bl mov [edx+(nf41_53-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_52-nf41_51)], bl mov [edx+(nf41_51-nf41_51)], bh 
mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf41_64-nf41_51)], bl mov [edx+(nf41_63-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_62-nf41_51)], bl mov [edx+(nf41_61-nf41_51)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf41_74-nf41_51)], bl mov [edx+(nf41_73-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_72-nf41_51)], bl mov [edx+(nf41_71-nf41_51)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf41_84-nf41_51)], bl mov [edx+(nf41_83-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_82-nf41_51)], bl mov [edx+(nf41_81-nf41_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf41_0 ; flush prefetch ALIGN 4 nf41_0: nf41_11:mov ah, bl mov al, ah shl eax, 16 nf41_12:mov al, bl mov ah, al mov [edi], eax nf41_13:mov ah, bl mov al, ah shl eax, 16 nf41_14:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_21:mov ah, bl mov al, ah shl eax, 16 nf41_22:mov al, bl mov ah, al mov [edi], eax nf41_23:mov ah, bl mov al, ah shl eax, 16 nf41_24:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_31:mov ah, bl mov al, ah shl eax, 16 nf41_32:mov al, bl mov ah, al mov [edi], eax nf41_33:mov ah, bl mov al, ah shl eax, 16 nf41_34:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_41:mov ah, bl mov al, ah shl eax, 16 nf41_42:mov al, bl mov ah, al mov [edi], eax nf41_43:mov ah, bl mov al, ah shl eax, 16 nf41_44:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_51:mov ah, bl mov al, ah shl eax, 16 nf41_52:mov al, bl mov ah, al mov [edi], eax nf41_53:mov ah, bl mov al, ah shl eax, 16 nf41_54:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_61:mov ah, bl mov al, ah shl eax, 16 nf41_62:mov al, bl mov ah, al mov [edi], eax nf41_63:mov ah, bl mov al, ah shl eax, 16 nf41_64:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_71:mov ah, bl mov al, ah shl eax, 16 nf41_72:mov al, bl mov ah, al mov [edi], eax nf41_73:mov ah, bl mov al, ah shl eax, 16 nf41_74:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_81:mov ah, bl mov al, ah shl eax, 16 nf41_82:mov al, bl mov ah, al mov [edi], eax nf41_83:mov ah, bl mov al, ah shl eax, 16 nf41_84:mov al, bl mov ah, al mov [edi+4], eax add esi, 12 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf9+48 nf57: ; low 8x4x2 (12 bytes) xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf57_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf57_11-nf57_11)], bl mov [edx+(nf57_12-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_13-nf57_11)], bl mov [edx+(nf57_14-nf57_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf57_15-nf57_11)], bl mov [edx+(nf57_16-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_17-nf57_11)], bl mov [edx+(nf57_18-nf57_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf57_21-nf57_11)], bl mov [edx+(nf57_22-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_23-nf57_11)], bl mov [edx+(nf57_24-nf57_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf57_25-nf57_11)], bl mov [edx+(nf57_26-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_27-nf57_11)], bl mov [edx+(nf57_28-nf57_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf57_31-nf57_11)], bl mov [edx+(nf57_32-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_33-nf57_11)], bl mov [edx+(nf57_34-nf57_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf57_35-nf57_11)], bl mov [edx+(nf57_36-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_37-nf57_11)], bl mov [edx+(nf57_38-nf57_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf57_41-nf57_11)], bl mov [edx+(nf57_42-nf57_11)], bh shr ebx, 16 mov 
[edx+(nf57_43-nf57_11)], bl mov [edx+(nf57_44-nf57_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf57_45-nf57_11)], bl mov [edx+(nf57_46-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_47-nf57_11)], bl mov [edx+(nf57_48-nf57_11)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf57_0 ; flush prefetch ALIGN 4 nf57_0: nf57_11:mov al, bl nf57_12:mov ah, bl shl eax, 16 nf57_13:mov al, bl nf57_14:mov ah, bl mov [edi], eax mov [edi+edx], eax nf57_15:mov al, bl nf57_16:mov ah, bl shl eax, 16 nf57_17:mov al, bl nf57_18:mov ah, bl mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf57_21:mov al, bl nf57_22:mov ah, bl shl eax, 16 nf57_23:mov al, bl nf57_24:mov ah, bl mov [edi], eax mov [edi+edx], eax nf57_25:mov al, bl nf57_26:mov ah, bl shl eax, 16 nf57_27:mov al, bl nf57_28:mov ah, bl mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf57_31:mov al, bl nf57_32:mov ah, bl shl eax, 16 nf57_33:mov al, bl nf57_34:mov ah, bl mov [edi], eax mov [edi+edx], eax nf57_35:mov al, bl nf57_36:mov ah, bl shl eax, 16 nf57_37:mov al, bl nf57_38:mov ah, bl mov [edi+4], eax mov [edi+edx+4], eax lea edi, [edi+edx*2] nf57_41:mov al, bl nf57_42:mov ah, bl shl eax, 16 nf57_43:mov al, bl nf57_44:mov ah, bl mov [edi], eax mov [edi+edx], eax nf57_45:mov al, bl nf57_46:mov ah, bl shl eax, 16 nf57_47:mov al, bl nf57_48:mov ah, bl mov [edi+4], eax mov [edi+edx+4], eax add edi, edx add esi, 12 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 nf10: ; 2x2 4x4x2 (32 bytes) mov ax, [esi] cmp al, ah ja nf26 xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf10_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf10_11-nf10_11)], bl mov [edx+(nf10_12-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_13-nf10_11)], bl mov [edx+(nf10_14-nf10_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf10_15-nf10_11)], bl mov [edx+(nf10_16-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_17-nf10_11)], bl mov [edx+(nf10_18-nf10_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf10_21-nf10_11)], bl mov [edx+(nf10_22-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_23-nf10_11)], bl mov [edx+(nf10_24-nf10_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf10_25-nf10_11)], bl mov [edx+(nf10_26-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_27-nf10_11)], bl mov [edx+(nf10_28-nf10_11)], bh mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf10_31-nf10_11)], bl mov [edx+(nf10_32-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_33-nf10_11)], bl mov [edx+(nf10_34-nf10_11)], bh mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf10_35-nf10_11)], bl mov [edx+(nf10_36-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_37-nf10_11)], bl mov [edx+(nf10_38-nf10_11)], bh mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf10_41-nf10_11)], bl mov [edx+(nf10_42-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_43-nf10_11)], bl mov [edx+(nf10_44-nf10_11)], bh mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf10_45-nf10_11)], bl mov [edx+(nf10_46-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_47-nf10_11)], bl mov [edx+(nf10_48-nf10_11)], bh lea edx, [edx+(nf10_51-nf10_11)] mov al, [esi+20] mov ebx, [ecx+eax*4] mov [edx+(nf10_51-nf10_51)], bl mov [edx+(nf10_52-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_53-nf10_51)], bl mov [edx+(nf10_54-nf10_51)], bh mov al, [esi+21] mov ebx, [ecx+eax*4] mov [edx+(nf10_55-nf10_51)], bl mov [edx+(nf10_56-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_57-nf10_51)], bl mov [edx+(nf10_58-nf10_51)], bh mov al, [esi+22] mov ebx, [ecx+eax*4] mov [edx+(nf10_61-nf10_51)], bl mov 
[edx+(nf10_62-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_63-nf10_51)], bl mov [edx+(nf10_64-nf10_51)], bh mov al, [esi+23] mov ebx, [ecx+eax*4] mov [edx+(nf10_65-nf10_51)], bl mov [edx+(nf10_66-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_67-nf10_51)], bl mov [edx+(nf10_68-nf10_51)], bh mov al, [esi+28] mov ebx, [ecx+eax*4] mov [edx+(nf10_71-nf10_51)], bl mov [edx+(nf10_72-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_73-nf10_51)], bl mov [edx+(nf10_74-nf10_51)], bh mov al, [esi+29] mov ebx, [ecx+eax*4] mov [edx+(nf10_75-nf10_51)], bl mov [edx+(nf10_76-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_77-nf10_51)], bl mov [edx+(nf10_78-nf10_51)], bh mov al, [esi+30] mov ebx, [ecx+eax*4] mov [edx+(nf10_81-nf10_51)], bl mov [edx+(nf10_82-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_83-nf10_51)], bl mov [edx+(nf10_84-nf10_51)], bh mov al, [esi+31] mov ebx, [ecx+eax*4] mov [edx+(nf10_85-nf10_51)], bl mov [edx+(nf10_86-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_87-nf10_51)], bl mov [edx+(nf10_88-nf10_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf10_0 ; flush prefetch ALIGN 4 nf10_0: nf10_11:mov al, bl nf10_12:mov ah, bl shl eax, 16 nf10_13:mov al, bl nf10_14:mov ah, bl mov [edi], eax add edi, edx nf10_15:mov al, bl nf10_16:mov ah, bl shl eax, 16 nf10_17:mov al, bl nf10_18:mov ah, bl mov [edi], eax add edi, edx nf10_21:mov al, bl nf10_22:mov ah, bl shl eax, 16 nf10_23:mov al, bl nf10_24:mov ah, bl mov [edi], eax add edi, edx nf10_25:mov al, bl nf10_26:mov ah, bl shl eax, 16 nf10_27:mov al, bl nf10_28:mov ah, bl mov [edi], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+8] mov cx, [esi+10] nf10_31:mov al, bl nf10_32:mov ah, bl shl eax, 16 nf10_33:mov al, bl nf10_34:mov ah, bl mov [edi], eax add edi, edx nf10_35:mov al, bl nf10_36:mov ah, bl shl eax, 16 nf10_37:mov al, bl nf10_38:mov ah, bl mov [edi], eax add edi, edx nf10_41:mov al, bl nf10_42:mov ah, bl shl eax, 16 nf10_43:mov al, bl nf10_44:mov ah, bl mov [edi], eax add edi, edx nf10_45:mov al, bl nf10_46:mov ah, bl shl eax, 16 nf10_47:mov al, bl nf10_48:mov ah, bl mov [edi], eax add edi, edx lea eax, [edx*8-4] sub edi, eax ; Load bl,bh,cl,ch with four colors mov bx, [esi+16] mov cx, [esi+18] nf10_51:mov al, bl nf10_52:mov ah, bl shl eax, 16 nf10_53:mov al, bl nf10_54:mov ah, bl mov [edi], eax add edi, edx nf10_55:mov al, bl nf10_56:mov ah, bl shl eax, 16 nf10_57:mov al, bl nf10_58:mov ah, bl mov [edi], eax add edi, edx nf10_61:mov al, bl nf10_62:mov ah, bl shl eax, 16 nf10_63:mov al, bl nf10_64:mov ah, bl mov [edi], eax add edi, edx nf10_65:mov al, bl nf10_66:mov ah, bl shl eax, 16 nf10_67:mov al, bl nf10_68:mov ah, bl mov [edi], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+24] mov cx, [esi+26] nf10_71:mov al, bl nf10_72:mov ah, bl shl eax, 16 nf10_73:mov al, bl nf10_74:mov ah, bl mov [edi], eax add edi, edx nf10_75:mov al, bl nf10_76:mov ah, bl shl eax, 16 nf10_77:mov al, bl nf10_78:mov ah, bl mov [edi], eax add edi, edx nf10_81:mov al, bl nf10_82:mov ah, bl shl eax, 16 nf10_83:mov al, bl nf10_84:mov ah, bl mov [edi], eax add edi, edx nf10_85:mov al, bl nf10_86:mov ah, bl shl eax, 16 nf10_87:mov al, bl nf10_88:mov ah, bl mov [edi], eax add esi, 32 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf10+16 nf26: ; 2x1 4x8x2 (24 bytes) mov ax, [esi+12] cmp al, ah ja nf42 if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 24 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf26_11+1 mov al, [esi+4] mov 
ebx, [ecx+eax*4] mov [edx+(nf26_11-nf26_11)], bl mov [edx+(nf26_12-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_13-nf26_11)], bl mov [edx+(nf26_14-nf26_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf26_15-nf26_11)], bl mov [edx+(nf26_16-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_17-nf26_11)], bl mov [edx+(nf26_18-nf26_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf26_21-nf26_11)], bl mov [edx+(nf26_22-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_23-nf26_11)], bl mov [edx+(nf26_24-nf26_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf26_25-nf26_11)], bl mov [edx+(nf26_26-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_27-nf26_11)], bl mov [edx+(nf26_28-nf26_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf26_31-nf26_11)], bl mov [edx+(nf26_32-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_33-nf26_11)], bl mov [edx+(nf26_34-nf26_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf26_35-nf26_11)], bl mov [edx+(nf26_36-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_37-nf26_11)], bl mov [edx+(nf26_38-nf26_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf26_41-nf26_11)], bl mov [edx+(nf26_42-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_43-nf26_11)], bl mov [edx+(nf26_44-nf26_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf26_45-nf26_11)], bl mov [edx+(nf26_46-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_47-nf26_11)], bl mov [edx+(nf26_48-nf26_11)], bh lea edx, [edx+(nf26_51-nf26_11)] mov al, [esi+16] mov ebx, [ecx+eax*4] mov [edx+(nf26_51-nf26_51)], bl mov [edx+(nf26_52-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_53-nf26_51)], bl mov [edx+(nf26_54-nf26_51)], bh mov al, [esi+17] mov ebx, [ecx+eax*4] mov [edx+(nf26_55-nf26_51)], bl mov [edx+(nf26_56-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_57-nf26_51)], bl mov [edx+(nf26_58-nf26_51)], bh mov al, [esi+18] mov ebx, [ecx+eax*4] mov [edx+(nf26_61-nf26_51)], bl mov [edx+(nf26_62-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_63-nf26_51)], bl mov [edx+(nf26_64-nf26_51)], bh mov al, [esi+19] mov ebx, [ecx+eax*4] mov [edx+(nf26_65-nf26_51)], bl mov [edx+(nf26_66-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_67-nf26_51)], bl mov [edx+(nf26_68-nf26_51)], bh mov al, [esi+20] mov ebx, [ecx+eax*4] mov [edx+(nf26_71-nf26_51)], bl mov [edx+(nf26_72-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_73-nf26_51)], bl mov [edx+(nf26_74-nf26_51)], bh mov al, [esi+21] mov ebx, [ecx+eax*4] mov [edx+(nf26_75-nf26_51)], bl mov [edx+(nf26_76-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_77-nf26_51)], bl mov [edx+(nf26_78-nf26_51)], bh mov al, [esi+22] mov ebx, [ecx+eax*4] mov [edx+(nf26_81-nf26_51)], bl mov [edx+(nf26_82-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_83-nf26_51)], bl mov [edx+(nf26_84-nf26_51)], bh mov al, [esi+23] mov ebx, [ecx+eax*4] mov [edx+(nf26_85-nf26_51)], bl mov [edx+(nf26_86-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_87-nf26_51)], bl mov [edx+(nf26_88-nf26_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf26_0 ; flush prefetch ALIGN 4 nf26_0: nf26_11:mov al, bl nf26_12:mov ah, bl shl eax, 16 nf26_13:mov al, bl nf26_14:mov ah, bl mov [edi], eax add edi, edx nf26_15:mov al, bl nf26_16:mov ah, bl shl eax, 16 nf26_17:mov al, bl nf26_18:mov ah, bl mov [edi], eax add edi, edx nf26_21:mov al, bl nf26_22:mov ah, bl shl eax, 16 nf26_23:mov al, bl nf26_24:mov ah, bl mov [edi], eax add edi, edx nf26_25:mov al, bl nf26_26:mov ah, bl shl eax, 16 nf26_27:mov al, bl nf26_28:mov ah, bl mov [edi], eax add edi, edx nf26_31:mov al, bl nf26_32:mov ah, bl shl eax, 16 nf26_33:mov al, bl nf26_34:mov ah, bl mov [edi], eax add 
edi, edx nf26_35:mov al, bl nf26_36:mov ah, bl shl eax, 16 nf26_37:mov al, bl nf26_38:mov ah, bl mov [edi], eax add edi, edx nf26_41:mov al, bl nf26_42:mov ah, bl shl eax, 16 nf26_43:mov al, bl nf26_44:mov ah, bl mov [edi], eax add edi, edx nf26_45:mov al, bl nf26_46:mov ah, bl shl eax, 16 nf26_47:mov al, bl nf26_48:mov ah, bl mov [edi], eax add edi, edx lea eax, [edx*8-4] sub edi, eax ; Load bl,bh,cl,ch with four colors mov bx, [esi+12] mov cx, [esi+14] nf26_51:mov al, bl nf26_52:mov ah, bl shl eax, 16 nf26_53:mov al, bl nf26_54:mov ah, bl mov [edi], eax add edi, edx nf26_55:mov al, bl nf26_56:mov ah, bl shl eax, 16 nf26_57:mov al, bl nf26_58:mov ah, bl mov [edi], eax add edi, edx nf26_61:mov al, bl nf26_62:mov ah, bl shl eax, 16 nf26_63:mov al, bl nf26_64:mov ah, bl mov [edi], eax add edi, edx nf26_65:mov al, bl nf26_66:mov ah, bl shl eax, 16 nf26_67:mov al, bl nf26_68:mov ah, bl mov [edi], eax add edi, edx nf26_71:mov al, bl nf26_72:mov ah, bl shl eax, 16 nf26_73:mov al, bl nf26_74:mov ah, bl mov [edi], eax add edi, edx nf26_75:mov al, bl nf26_76:mov ah, bl shl eax, 16 nf26_77:mov al, bl nf26_78:mov ah, bl mov [edi], eax add edi, edx nf26_81:mov al, bl nf26_82:mov ah, bl shl eax, 16 nf26_83:mov al, bl nf26_84:mov ah, bl mov [edi], eax add edi, edx nf26_85:mov al, bl nf26_86:mov ah, bl shl eax, 16 nf26_87:mov al, bl nf26_88:mov ah, bl mov [edi], eax add esi, 24 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf10+32 nf42: ; 1x2 8x4x2 (24 bytes) if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 24 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf42_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf42_11-nf42_11)], bl mov [edx+(nf42_12-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_13-nf42_11)], bl mov [edx+(nf42_14-nf42_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf42_15-nf42_11)], bl mov [edx+(nf42_16-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_17-nf42_11)], bl mov [edx+(nf42_18-nf42_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf42_21-nf42_11)], bl mov [edx+(nf42_22-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_23-nf42_11)], bl mov [edx+(nf42_24-nf42_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf42_25-nf42_11)], bl mov [edx+(nf42_26-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_27-nf42_11)], bl mov [edx+(nf42_28-nf42_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf42_31-nf42_11)], bl mov [edx+(nf42_32-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_33-nf42_11)], bl mov [edx+(nf42_34-nf42_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf42_35-nf42_11)], bl mov [edx+(nf42_36-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_37-nf42_11)], bl mov [edx+(nf42_38-nf42_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf42_41-nf42_11)], bl mov [edx+(nf42_42-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_43-nf42_11)], bl mov [edx+(nf42_44-nf42_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf42_45-nf42_11)], bl mov [edx+(nf42_46-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_47-nf42_11)], bl mov [edx+(nf42_48-nf42_11)], bh lea edx, [edx+(nf42_51-nf42_11)] mov al, [esi+16] mov ebx, [ecx+eax*4] mov [edx+(nf42_51-nf42_51)], bl mov [edx+(nf42_52-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_53-nf42_51)], bl mov [edx+(nf42_54-nf42_51)], bh mov al, [esi+17] mov ebx, [ecx+eax*4] mov [edx+(nf42_55-nf42_51)], bl mov [edx+(nf42_56-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_57-nf42_51)], bl mov [edx+(nf42_58-nf42_51)], bh mov al, [esi+18] mov ebx, [ecx+eax*4] mov [edx+(nf42_61-nf42_51)], bl mov 
[edx+(nf42_62-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_63-nf42_51)], bl mov [edx+(nf42_64-nf42_51)], bh mov al, [esi+19] mov ebx, [ecx+eax*4] mov [edx+(nf42_65-nf42_51)], bl mov [edx+(nf42_66-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_67-nf42_51)], bl mov [edx+(nf42_68-nf42_51)], bh mov al, [esi+20] mov ebx, [ecx+eax*4] mov [edx+(nf42_71-nf42_51)], bl mov [edx+(nf42_72-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_73-nf42_51)], bl mov [edx+(nf42_74-nf42_51)], bh mov al, [esi+21] mov ebx, [ecx+eax*4] mov [edx+(nf42_75-nf42_51)], bl mov [edx+(nf42_76-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_77-nf42_51)], bl mov [edx+(nf42_78-nf42_51)], bh mov al, [esi+22] mov ebx, [ecx+eax*4] mov [edx+(nf42_81-nf42_51)], bl mov [edx+(nf42_82-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_83-nf42_51)], bl mov [edx+(nf42_84-nf42_51)], bh mov al, [esi+23] mov ebx, [ecx+eax*4] mov [edx+(nf42_85-nf42_51)], bl mov [edx+(nf42_86-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_87-nf42_51)], bl mov [edx+(nf42_88-nf42_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf42_0 ; flush prefetch ALIGN 4 nf42_0: nf42_11:mov al, bl nf42_12:mov ah, bl shl eax, 16 nf42_13:mov al, bl nf42_14:mov ah, bl mov [edi], eax nf42_15:mov al, bl nf42_16:mov ah, bl shl eax, 16 nf42_17:mov al, bl nf42_18:mov ah, bl mov [edi+4], eax add edi, edx nf42_21:mov al, bl nf42_22:mov ah, bl shl eax, 16 nf42_23:mov al, bl nf42_24:mov ah, bl mov [edi], eax nf42_25:mov al, bl nf42_26:mov ah, bl shl eax, 16 nf42_27:mov al, bl nf42_28:mov ah, bl mov [edi+4], eax add edi, edx nf42_31:mov al, bl nf42_32:mov ah, bl shl eax, 16 nf42_33:mov al, bl nf42_34:mov ah, bl mov [edi], eax nf42_35:mov al, bl nf42_36:mov ah, bl shl eax, 16 nf42_37:mov al, bl nf42_38:mov ah, bl mov [edi+4], eax add edi, edx nf42_41:mov al, bl nf42_42:mov ah, bl shl eax, 16 nf42_43:mov al, bl nf42_44:mov ah, bl mov [edi], eax nf42_45:mov al, bl nf42_46:mov ah, bl shl eax, 16 nf42_47:mov al, bl nf42_48:mov ah, bl mov [edi+4], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+12] mov cx, [esi+14] nf42_51:mov al, bl nf42_52:mov ah, bl shl eax, 16 nf42_53:mov al, bl nf42_54:mov ah, bl mov [edi], eax nf42_55:mov al, bl nf42_56:mov ah, bl shl eax, 16 nf42_57:mov al, bl nf42_58:mov ah, bl mov [edi+4], eax add edi, edx nf42_61:mov al, bl nf42_62:mov ah, bl shl eax, 16 nf42_63:mov al, bl nf42_64:mov ah, bl mov [edi], eax nf42_65:mov al, bl nf42_66:mov ah, bl shl eax, 16 nf42_67:mov al, bl nf42_68:mov ah, bl mov [edi+4], eax add edi, edx nf42_71:mov al, bl nf42_72:mov ah, bl shl eax, 16 nf42_73:mov al, bl nf42_74:mov ah, bl mov [edi], eax nf42_75:mov al, bl nf42_76:mov ah, bl shl eax, 16 nf42_77:mov al, bl nf42_78:mov ah, bl mov [edi+4], eax add edi, edx nf42_81:mov al, bl nf42_82:mov ah, bl shl eax, 16 nf42_83:mov al, bl nf42_84:mov ah, bl mov [edi], eax nf42_85:mov al, bl nf42_86:mov ah, bl shl eax, 16 nf42_87:mov al, bl nf42_88:mov ah, bl mov [edi+4], eax add esi, 24 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 nf11: ; 8x8x8 (64 bytes) if 0 ;debug add esi, 64 mov eax, 0fefefefeH ; mov ebx, eax mov ebx, 0 jmp nf_solid endif mov edx, nf_width mov eax, [esi] ;0 mov [edi], eax mov eax, [esi+4] mov [edi+4], eax add edi, edx mov eax, [esi+8] ;1 mov [edi], eax mov eax, [esi+12] mov [edi+4], eax add edi, edx mov eax, [esi+16] ;2 mov [edi], eax mov eax, [esi+20] mov [edi+4], eax add edi, edx mov eax, [esi+24] ;3 mov [edi], eax mov eax, [esi+28] mov [edi+4], eax add edi, edx mov eax, [esi+32] ;4 mov [edi], eax mov eax, 
[esi+36] mov [edi+4], eax add edi, edx mov eax, [esi+40] ;5 mov [edi], eax mov eax, [esi+44] mov [edi+4], eax add edi, edx mov eax, [esi+48] ;6 mov [edi], eax mov eax, [esi+52] mov [edi+4], eax add edi, edx mov eax, [esi+56] ;7 mov [edi], eax mov eax, [esi+60] mov [edi+4], eax add esi, 64 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf12: ; low 4x4x8 (16 bytes) mov edx, nf_width mov eax, [esi] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx mov [edi+edx], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx mov [edi+edx+4], ebx lea edi, [edi+edx*2] mov eax, [esi+4] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx mov [edi+edx], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx mov [edi+edx+4], ebx lea edi, [edi+edx*2] mov eax, [esi+8] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx mov [edi+edx], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx mov [edi+edx+4], ebx lea edi, [edi+edx*2] mov eax, [esi+12] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx mov [edi+edx], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx mov [edi+edx+4], ebx add edi, edx sub edi, nfpk_back_right add esi, 16 retn ;---------------------------------------- ALIGN 4 nf13: ; 2x2 4x4x0 (4 bytes) mov edx, nf_width mov cl, [esi] mov ch, cl mov eax, ecx shl eax, 16 mov ax, cx mov cl, [esi+1] mov ch, cl mov ebx, ecx shl ebx, 16 mov bx, cx mov [edi], eax mov [edi+4], ebx mov [edi+edx], eax mov [edi+edx+4], ebx lea edi, [edi+edx*2] mov [edi], eax mov [edi+4], ebx mov [edi+edx], eax mov [edi+edx+4], ebx lea edi, [edi+edx*2] mov cl, [esi+2] mov ch, cl mov eax, ecx shl eax, 16 mov ax, cx mov cl, [esi+3] mov ch, cl mov ebx, ecx shl ebx, 16 mov bx, cx mov [edi], eax mov [edi+4], ebx mov [edi+edx], eax mov [edi+edx+4], ebx lea edi, [edi+edx*2] mov [edi], eax mov [edi+4], ebx add edi, edx mov [edi], eax mov [edi+4], ebx sub edi, nfpk_back_right add esi, 4 retn ;---------------------------------------- ALIGN 4 nf14: ; 8x8x0 (1 byte) if 0 ;debug jmp nf0 endif mov bl, [esi] ; Copy color into 8 positions inc esi mov bh, bl mov eax, ebx shl eax, 16 mov ax, bx mov ebx, eax if 0 ;debug mov eax, 080808080h mov ebx, eax endif jmp nf_solid retn ;---------------------------------------- ALIGN 4 nf15: ; mix 8x8x0 (2 bytes) if 0 ;debug inc esi jmp nf0 endif mov bx, [esi] ; Copy 2 colors into 8 positions add esi, 2 ; in a checkerboard mov ax, bx shl eax, 16 mov ax, bx mov ebx, eax rol ebx, 8 if 0 ;debug mov eax, 080808080h mov ebx, eax endif nf_solid: mov edx, nf_width mov [edi], eax mov [edi+4], eax add edi, edx mov [edi], ebx mov [edi+4], ebx add edi, edx mov [edi], eax mov [edi+4], eax add edi, edx mov [edi], ebx mov [edi+4], ebx add edi, edx mov [edi], eax mov [edi+4], eax add edi, edx mov [edi], ebx mov [edi+4], ebx add edi, edx mov [edi], eax mov [edi+4], eax add edi, edx mov [edi], ebx mov [edi+4], ebx sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn nfPkDecomp ENDP ; Half vertical resolution version (skip odd lines) ; nfPkDecompH PROC USES ESI EDI EBX, \ ops:PTRBYTE, comp:PTRBYTE, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD LOCAL tbuf: PTRBYTE LOCAL new_row:DWORD LOCAL DiffBufPtrs:DWORD LOCAL nfpk_back_right: DWORD LOCAL wcnt:DWORD LOG_LABEL "StartPkDecomp" .data nfpk_OpTblH label dword dword offset nf0 ; Prev Same (0) dword offset nf1 ; No 
change (and copied to screen) (0) dword offset nf2 ; Near shift from older part of current buf (1) dword offset nf3 ; Near shift from newer part of current buf (1) dword offset nf4 ; Near shift from previous buffer (1) dword offset nf5 ; Far shift from previous buffer (2) dword offset nf6 ; Far shift from current buffer (2) ; [Or if COMPOPS, run of no changes (0)] dword offset nf7 ; 8x8x1 (10 bytes) or low 4x4x1 (4 bytes) dword offset nf8 ; 2x2 4x4x1 (16 bytes) or 2x1 4x8x1 (12 bytes) or 1x2 8x4x1 (12 bytes) dword offset nf9 ; 8x8x2 (20 bytes) or low 4x4x2 (8 bytes) or ; low 4x8x2 (12 bytes) or low 8x4x2 (12 bytes) dword offset nf10 ; 2x2 4x4x2 (32 bytes) or 2x1 4x8x2 (24 bytes) or 1x2 4x8x2 (24 bytes) dword offset nf11 ; 8x8x8 (64 bytes) dword offset nf12 ; low 4x4x8 (16 bytes) dword offset nf13 ; 2x2 4x4x0 (ie 2x2x8) (4 bytes) dword offset nf14 ; 8x8x0 (1 byte) dword offset nf15 ; mix 8x8x0 (2 bytes) .code ifdef SYMANTEC mov ebx, ds ; Allow DS to access code mov ecx, 0 mov ax, 3505h int 21h endif NF_DECOMP_INIT 0 mov eax, nf_width shl eax, 2 sub eax, nf_new_w mov new_row, eax shr nf_new_h, 1 mov eax, nf_width lea eax, [eax*2+eax-SWIDTH] mov nfpk_back_right, eax mov esi, comp mov edi, tbuf nf_StartRow: mov eax, w shr eax, 1 mov wcnt,eax ALIGN 4 nf_NextPair: dec wcnt js nf_NextRow mov ebx, ops mov al, [ebx] inc ebx mov ops, ebx xor ebx, ebx mov bl, al shr bl, 4 and eax, 0Fh push offset nf_NextPair push nfpk_OpTblH[ebx*4] jmp nfpk_OpTblH[eax*4] nf_NextRow: add edi, new_row dec h jnz nf_StartRow LOG_LABEL "EndPkDecomp" ifdef SYMANTEC mov ebx, ds ; Disable DS from accessing code mov ecx, offset DGROUP:_data_bottom[-1] mov ax, 3505h int 21h endif ret ;---------------------------------------- ALIGN 4 nf0: ; No change from previous buffer mov eax, DiffBufPtrs jmp nf_shift ;---------------------------------------- ALIGN 4 nf1: ; No change (and copied to screen) add edi, SWIDTH retn ;---------------------------------------- ALIGN 4 nf2: ; Near shift from older part of current buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP2[eax*2] nf_xyc_shift: xor ebx, ebx mov bl, ah shl eax, 24 sar eax, 24 add bl, 080h adc bl, 080h sar bl, 1 add eax, nfpk_ShiftY[ebx*4] jmp nf_shift ;---------------------------------------- ALIGN 4 nf3: ; Near shift from newer part of current buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP2[eax*2] neg al neg ah jmp nf_xyc_shift ;---------------------------------------- ALIGN 4 nf4: ; Near shift from previous buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP1[eax*2] jmp nf_xyp_shift ;---------------------------------------- ALIGN 4 nf5: ; Far shift from previous buffer mov ax, [esi] add esi, 2 nf_xyp_shift: xor ebx, ebx mov bl, ah shl eax, 24 sar eax, 24 add bl, 080h adc bl, 080h sar bl, 1 add eax, nfpk_ShiftY[ebx*4] add eax, DiffBufPtrs jmp nf_shift ;---------------------------------------- ALIGN 4 if COMPOPS nf6: ; Run of no changes (must only appear in first nibble opcodes) ; Next nibble k specifies 2k+4 squares with no changes add esp, 4 ; Next nibble is not an opcode add ebx, 2 ; (minimum of 4 squares) ALIGN 4 nf6a: add edi, SWIDTH*2 ; Advance over two squares dec ebx jz nf6z ; Last pair of squares dec wcnt ; Same row? 
jns nf6a ; Yes add edi, new_row ; Advance to next row dec h ; Decrement row count (should never become zero here) mov eax, w ; Reset wcnt shr eax ,1 dec eax mov wcnt, eax jmp nf6a nf6z: retn else nf6: ; Far shift from current buffer mov ax, [esi] add esi, 2 jmp nf_xyc_shift endif ;---------------------------------------- ALIGN 4 nf_shift: if 0 ;debug mov eax, 0 mov ebx, eax jmp nf_solid endif mov ebx, esi ; save esi lea esi, [edi+eax] mov edx, nf_width REPEAT 3 mov eax, [esi] mov [edi], eax mov eax, [esi+4] mov [edi+4], eax add esi, edx add edi, edx ENDM mov eax, [esi] mov [edi], eax mov eax, [esi+4] mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 mov esi, ebx ; restore esi retn ;---------------------------------------- ALIGN 4 nf7: ; 8x8x1 (10 bytes) mov ax, [esi] cmp al, ah ja nf23 if 0 ;debug add esi, 10 mov eax, 0fefefefeH mov ebx, eax jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf7_11+2 mov al, [esi+2] mov ebx, [ecx+eax*4] mov [edx+(nf7_11-nf7_11)], bl mov [edx+(nf7_12-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_13-nf7_11)], bl mov [edx+(nf7_14-nf7_11)], bh mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf7_31-nf7_11)], bl mov [edx+(nf7_32-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_33-nf7_11)], bl mov [edx+(nf7_34-nf7_11)], bh lea edx, [edx+(nf7_51-nf7_11)] mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf7_51-nf7_51)], bl mov [edx+(nf7_52-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_53-nf7_51)], bl mov [edx+(nf7_54-nf7_51)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf7_71-nf7_51)], bl mov [edx+(nf7_72-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_73-nf7_51)], bl mov [edx+(nf7_74-nf7_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). 
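; Illustrative note (not part of the original source): the 8x8x1 case is
; ten bytes of data -- two base colours followed by eight pattern bytes,
; one bit per pixel, bits consumed least significant first.  The patching
; above rewrites the operand bytes of the unrolled moves that follow, so
; no bits have to be tested at draw time; this half-resolution version
; uses only the pattern bytes for the even rows ([esi+2], [esi+4],
; [esi+6], [esi+8]) and writes four output rows.  A rough C sketch of the
; net effect for a full-height block (hypothetical helper, names are not
; from this library):
;
;   static void unpack_8x8x1(unsigned char *dst, const unsigned char *src,
;                            unsigned width)
;   {
;       unsigned char c[2] = { src[0], src[1] }; /* bit 0 -> c[0], bit 1 -> c[1] */
;       for (int row = 0; row < 8; ++row) {
;           unsigned char pat = src[2 + row];    /* one pattern byte per row */
;           for (int col = 0; col < 8; ++col)
;               dst[row * width + col] = c[(pat >> col) & 1];
;       }
;   }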
mov cx, [esi] mov esi,nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf7_0 ; flush prefetch ALIGN 4 nf7_0: nf7_11: mov ax, bx shl eax, 16 nf7_12: mov ax, bx mov [edi], eax nf7_13: mov ax, bx shl eax, 16 nf7_14: mov ax, bx mov [edi+4], eax add edi, esi nf7_31: mov ax, bx shl eax, 16 nf7_32: mov ax, bx mov [edi], eax nf7_33: mov ax, bx shl eax, 16 nf7_34: mov ax, bx mov [edi+4], eax add edi, esi nf7_51: mov ax, bx shl eax, 16 nf7_52: mov ax, bx mov [edi], eax nf7_53: mov ax, bx shl eax, 16 nf7_54: mov ax, bx mov [edi+4], eax add edi, esi nf7_71: mov ax, bx shl eax, 16 nf7_72: mov ax, bx mov [edi], eax nf7_73: mov ax, bx shl eax, 16 nf7_74: mov ax, bx mov [edi+4], eax pop esi pop ebp add esi, 10 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf7+16 nf23: ; low 4x4x1 (4 bytes) xor eax, eax lea ecx, nfpk_mov4l lea edx, byte ptr ds:nf23_11+2 mov al, [esi+2] and al, 0fH mov ebx, [ecx+eax*4] mov [edx+(nf23_11-nf23_11)], bl mov [edx+(nf23_12-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_13-nf23_11)], bl mov [edx+(nf23_14-nf23_11)], bh mov al, [esi+2] shr al, 4 mov ebx, [ecx+eax*4] mov [edx+(nf23_31-nf23_11)], bl mov [edx+(nf23_32-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_33-nf23_11)], bl mov [edx+(nf23_34-nf23_11)], bh mov al, [esi+3] and al, 0fH mov ebx, [ecx+eax*4] mov [edx+(nf23_51-nf23_11)], bl mov [edx+(nf23_52-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_53-nf23_11)], bl mov [edx+(nf23_54-nf23_11)], bh mov al, [esi+3] shr al, 4 mov ebx, [ecx+eax*4] mov [edx+(nf23_71-nf23_11)], bl mov [edx+(nf23_72-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_73-nf23_11)], bl mov [edx+(nf23_74-nf23_11)], bh mov edx, nf_width ; load bx,cx with 00,11 color combinations mov bx, [esi] mov cl, bh mov bh, bl mov ch, cl jmp nf23_0 ; flush prefetch ALIGN 4 nf23_0: nf23_11:mov ax, bx shl eax, 16 nf23_12:mov ax, bx mov [edi], eax nf23_13:mov ax, bx shl eax, 16 nf23_14:mov ax, bx mov [edi+4], eax add edi, edx nf23_31:mov ax, bx shl eax, 16 nf23_32:mov ax, bx mov [edi], eax nf23_33:mov ax, bx shl eax, 16 nf23_34:mov ax, bx mov [edi+4], eax add edi, edx nf23_51:mov ax, bx shl eax, 16 nf23_52:mov ax, bx mov [edi], eax nf23_53:mov ax, bx shl eax, 16 nf23_54:mov ax, bx mov [edi+4], eax add edi, edx nf23_71:mov ax, bx shl eax, 16 nf23_72:mov ax, bx mov [edi], eax nf23_73:mov ax, bx shl eax, 16 nf23_74:mov ax, bx mov [edi+4], eax sub edi, nfpk_back_right add esi, 4 retn ;---------------------------------------- ALIGN 4 nf8: ; 2x2 4x4x1 (16 bytes) mov ax, [esi] cmp al, ah ja nf24 xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf8_11+2 mov al, [esi+2] mov ebx, [ecx+eax*4] mov [edx+(nf8_11-nf8_11)], bl mov [edx+(nf8_12-nf8_11)], bh mov al, [esi+3] mov ebx, [ecx+eax*4] mov [edx+(nf8_21-nf8_11)], bl mov [edx+(nf8_22-nf8_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf8_31-nf8_11)], bl mov [edx+(nf8_32-nf8_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf8_41-nf8_11)], bl mov [edx+(nf8_42-nf8_11)], bh add edx, nf8_51-nf8_11 mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf8_51-nf8_51)], bl mov [edx+(nf8_52-nf8_51)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf8_61-nf8_51)], bl mov [edx+(nf8_62-nf8_51)], bh mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf8_71-nf8_51)], bl mov [edx+(nf8_72-nf8_51)], bh mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf8_81-nf8_51)], bl mov [edx+(nf8_82-nf8_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant 
first). mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf8_0 ; flush prefetch ALIGN 4 nf8_0: nf8_11: mov ax, bx shl eax, 16 nf8_12: mov ax, bx mov [edi], eax add edi, esi nf8_21: mov ax, bx shl eax, 16 nf8_22: mov ax, bx mov [edi], eax add edi, esi mov eax, [esp] mov cx, [eax+4] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_31: mov ax, bx shl eax, 16 nf8_32: mov ax, bx mov [edi], eax add edi, esi nf8_41: mov ax, bx shl eax, 16 nf8_42: mov ax, bx mov [edi], eax add edi, esi lea eax, [esi*4-4] sub edi, eax mov eax, [esp] mov cx, [eax+8] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_51: mov ax, bx shl eax, 16 nf8_52: mov ax, bx mov [edi], eax add edi, esi nf8_61: mov ax, bx shl eax, 16 nf8_62: mov ax, bx mov [edi], eax add edi, esi mov eax, [esp] mov cx, [eax+12] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_71: mov ax, bx shl eax, 16 nf8_72: mov ax, bx mov [edi], eax add edi, esi nf8_81: mov ax, bx shl eax, 16 nf8_82: mov ax, bx mov [edi], eax pop esi pop ebp add esi, 16 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf8+16 nf24: ; 2x1 4x8x1 (12 bytes) mov ax, [esi+6] cmp al, ah ja nf40 xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf24_11+2 mov al, [esi+2] mov ebx, [ecx+eax*4] mov [edx+(nf24_11-nf24_11)], bl mov [edx+(nf24_12-nf24_11)], bh mov al, [esi+3] mov ebx, [ecx+eax*4] mov [edx+(nf24_21-nf24_11)], bl mov [edx+(nf24_22-nf24_11)], bh mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf24_31-nf24_11)], bl mov [edx+(nf24_32-nf24_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf24_41-nf24_11)], bl mov [edx+(nf24_42-nf24_11)], bh add edx, nf24_51-nf24_11 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf24_51-nf24_51)], bl mov [edx+(nf24_52-nf24_51)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf24_61-nf24_51)], bl mov [edx+(nf24_62-nf24_51)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf24_71-nf24_51)], bl mov [edx+(nf24_72-nf24_51)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf24_81-nf24_51)], bl mov [edx+(nf24_82-nf24_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). 
mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf24_0 ; flush prefetch ALIGN 4 nf24_0: nf24_11:mov ax, bx shl eax, 16 nf24_12:mov ax, bx mov [edi], eax add edi, esi nf24_21:mov ax, bx shl eax, 16 nf24_22:mov ax, bx mov [edi], eax add edi, esi nf24_31:mov ax, bx shl eax, 16 nf24_32:mov ax, bx mov [edi], eax add edi, esi nf24_41:mov ax, bx shl eax, 16 nf24_42:mov ax, bx mov [edi], eax add edi, esi lea eax, [esi*4-4] sub edi, eax mov eax, [esp] mov cx, [eax+6] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf24_51:mov ax, bx shl eax, 16 nf24_52:mov ax, bx mov [edi], eax add edi, esi nf24_61:mov ax, bx shl eax, 16 nf24_62:mov ax, bx mov [edi], eax add edi, esi nf24_71:mov ax, bx shl eax, 16 nf24_72:mov ax, bx mov [edi], eax add edi, esi nf24_81:mov ax, bx shl eax, 16 nf24_82:mov ax, bx mov [edi], eax pop esi pop ebp add esi, 12 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf8+32 nf40: ; 1x2 8x4x1 (12 bytes) xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf40_11+2 mov al, [esi+2] mov ebx, [ecx+eax*4] mov [edx+(nf40_11-nf40_11)], bl mov [edx+(nf40_12-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_13-nf40_11)], bl mov [edx+(nf40_14-nf40_11)], bh mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf40_31-nf40_11)], bl mov [edx+(nf40_32-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_33-nf40_11)], bl mov [edx+(nf40_34-nf40_11)], bh add edx, nf40_51-nf40_11 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf40_51-nf40_51)], bl mov [edx+(nf40_52-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_53-nf40_51)], bl mov [edx+(nf40_54-nf40_51)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf40_71-nf40_51)], bl mov [edx+(nf40_72-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_73-nf40_51)], bl mov [edx+(nf40_74-nf40_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). 
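; Illustrative note (not part of the original source): every one of these
; pattern cases uses the same idiom -- each data byte indexes a
; precomputed table (nfpk_mov8 here; the 2-bit cases use nfpk_mov4) and
; the bytes of the table entry are written straight into the operand
; bytes of the unrolled moves, so the pattern is decoded once by patching
; the code rather than bit by bit at draw time.  A hypothetical C sketch
; of how such a table could be generated (the real table's contents, byte
; order and operand encodings are defined elsewhere and may differ;
; operand_for[] is a placeholder):
;
;   static void build_mov8_like_table(unsigned table[256],
;                                     const unsigned char operand_for[4])
;   {
;       for (int pat = 0; pat < 256; ++pat) {
;           unsigned packed = 0;
;           for (int pair = 0; pair < 4; ++pair) {
;               int sel = (pat >> (pair * 2)) & 3;   /* two pixels per pair */
;               packed |= (unsigned)operand_for[sel] << (pair * 8);
;           }
;           table[pat] = packed;
;       }
;   }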
mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf40_0 ; flush prefetch ALIGN 4 nf40_0: nf40_11:mov ax, bx shl eax, 16 nf40_12:mov ax, bx mov [edi], eax nf40_13:mov ax, bx shl eax, 16 nf40_14:mov ax, bx mov [edi+4], eax add edi, esi nf40_31:mov ax, bx shl eax, 16 nf40_32:mov ax, bx mov [edi], eax nf40_33:mov ax, bx shl eax, 16 nf40_34:mov ax, bx mov [edi+4], eax add edi, esi mov eax, [esp] mov cx, [eax+6] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf40_51:mov ax, bx shl eax, 16 nf40_52:mov ax, bx mov [edi], eax nf40_53:mov ax, bx shl eax, 16 nf40_54:mov ax, bx mov [edi+4], eax add edi, esi nf40_71:mov ax, bx shl eax, 16 nf40_72:mov ax, bx mov [edi], eax nf40_73:mov ax, bx shl eax, 16 nf40_74:mov ax, bx mov [edi+4], eax pop esi pop ebp add esi, 12 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf9: ; 8x8x2 (20 bytes) mov eax, [esi] cmp al, ah ja nf41 shr eax, 16 cmp al, ah ja nf25 xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf9_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf9_11-nf9_11)], bl mov [edx+(nf9_12-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_13-nf9_11)], bl mov [edx+(nf9_14-nf9_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf9_15-nf9_11)], bl mov [edx+(nf9_16-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_17-nf9_11)], bl mov [edx+(nf9_18-nf9_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf9_31-nf9_11)], bl mov [edx+(nf9_32-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_33-nf9_11)], bl mov [edx+(nf9_34-nf9_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf9_35-nf9_11)], bl mov [edx+(nf9_36-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_37-nf9_11)], bl mov [edx+(nf9_38-nf9_11)], bh lea edx, [edx+(nf9_51-nf9_11)] mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf9_51-nf9_51)], bl mov [edx+(nf9_52-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_53-nf9_51)], bl mov [edx+(nf9_54-nf9_51)], bh mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf9_55-nf9_51)], bl mov [edx+(nf9_56-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_57-nf9_51)], bl mov [edx+(nf9_58-nf9_51)], bh mov al, [esi+16] mov ebx, [ecx+eax*4] mov [edx+(nf9_71-nf9_51)], bl mov [edx+(nf9_72-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_73-nf9_51)], bl mov [edx+(nf9_74-nf9_51)], bh mov al, [esi+17] mov ebx, [ecx+eax*4] mov [edx+(nf9_75-nf9_51)], bl mov [edx+(nf9_76-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_77-nf9_51)], bl mov [edx+(nf9_78-nf9_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf9_0 ; flush prefetch ALIGN 4 nf9_0: nf9_11: mov al, bl nf9_12: mov ah, bl shl eax, 16 nf9_13: mov al, bl nf9_14: mov ah, bl mov [edi], eax nf9_15: mov al, bl nf9_16: mov ah, bl shl eax, 16 nf9_17: mov al, bl nf9_18: mov ah, bl mov [edi+4], eax add edi, edx nf9_31: mov al, bl nf9_32: mov ah, bl shl eax, 16 nf9_33: mov al, bl nf9_34: mov ah, bl mov [edi], eax nf9_35: mov al, bl nf9_36: mov ah, bl shl eax, 16 nf9_37: mov al, bl nf9_38: mov ah, bl mov [edi+4], eax add edi, edx nf9_51: mov al, bl nf9_52: mov ah, bl shl eax, 16 nf9_53: mov al, bl nf9_54: mov ah, bl mov [edi], eax nf9_55: mov al, bl nf9_56: mov ah, bl shl eax, 16 nf9_57: mov al, bl nf9_58: mov ah, bl mov [edi+4], eax add edi, edx nf9_71: mov al, bl nf9_72: mov ah, bl shl eax, 16 nf9_73: mov al, bl nf9_74: mov ah, bl mov [edi], eax nf9_75: mov al, bl nf9_76: mov ah, bl shl eax, 16 nf9_77: mov al, bl nf9_78: mov ah, bl mov [edi+4], eax add esi, 20 sub edi, nfpk_back_right retn 
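; Illustrative note (not part of the original source): nf9 above and the
; nf25/nf41/nf57 variants that follow handle the 2-bit-per-pixel layouts.
; The full 8x8x2 case is twenty bytes -- four colours followed by sixteen
; pattern bytes, two bits per pixel, least significant bits first, two
; pattern bytes per row; this half-resolution routine draws only the even
; source rows (pattern bytes [esi+4..5], [esi+8..9], [esi+12..13],
; [esi+16..17]) into four output rows but still advances the source by
; twenty bytes.  A rough C sketch of the net effect for a full-height
; block (hypothetical helper, names are not from this library):
;
;   static void unpack_8x8x2(unsigned char *dst, const unsigned char *src,
;                            unsigned width)
;   {
;       const unsigned char *c = src;            /* c[0..3] = four colours */
;       for (int row = 0; row < 8; ++row)
;           for (int col = 0; col < 8; ++col) {
;               unsigned char pat = src[4 + row * 2 + col / 4];
;               dst[row * width + col] = c[(pat >> ((col & 3) * 2)) & 3];
;           }
;   }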
;---------------------------------------- ALIGN 4 ;nf9+16 nf25: ; low 4x4x2 (8 bytes) if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 8 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf25_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf25_14-nf25_11)], bl mov [edx+(nf25_13-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_12-nf25_11)], bl mov [edx+(nf25_11-nf25_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf25_24-nf25_11)], bl mov [edx+(nf25_23-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_22-nf25_11)], bl mov [edx+(nf25_21-nf25_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf25_34-nf25_11)], bl mov [edx+(nf25_33-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_32-nf25_11)], bl mov [edx+(nf25_31-nf25_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf25_44-nf25_11)], bl mov [edx+(nf25_43-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_42-nf25_11)], bl mov [edx+(nf25_41-nf25_11)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf25_0 ; flush prefetch ALIGN 4 nf25_0: nf25_11:mov ah, bl mov al, ah shl eax, 16 nf25_12:mov al, bl mov ah, al mov [edi], eax nf25_13:mov ah, bl mov al, ah shl eax, 16 nf25_14:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf25_21:mov ah, bl mov al, ah shl eax, 16 nf25_22:mov al, bl mov ah, al mov [edi], eax nf25_23:mov ah, bl mov al, ah shl eax, 16 nf25_24:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf25_31:mov ah, bl mov al, ah shl eax, 16 nf25_32:mov al, bl mov ah, al mov [edi], eax nf25_33:mov ah, bl mov al, ah shl eax, 16 nf25_34:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf25_41:mov ah, bl mov al, ah shl eax, 16 nf25_42:mov al, bl mov ah, al mov [edi], eax nf25_43:mov ah, bl mov al, ah shl eax, 16 nf25_44:mov al, bl mov ah, al mov [edi+4], eax add esi, 8 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf9+32 nf41: ; low 4x8x2 (12 bytes) shr eax, 16 cmp al, ah ja nf57 xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf41_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf41_14-nf41_11)], bl mov [edx+(nf41_13-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_12-nf41_11)], bl mov [edx+(nf41_11-nf41_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf41_34-nf41_11)], bl mov [edx+(nf41_33-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_32-nf41_11)], bl mov [edx+(nf41_31-nf41_11)], bh lea edx, [edx+(nf41_51-nf41_11)] mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf41_54-nf41_51)], bl mov [edx+(nf41_53-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_52-nf41_51)], bl mov [edx+(nf41_51-nf41_51)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf41_74-nf41_51)], bl mov [edx+(nf41_73-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_72-nf41_51)], bl mov [edx+(nf41_71-nf41_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf41_0 ; flush prefetch ALIGN 4 nf41_0: nf41_11:mov ah, bl mov al, ah shl eax, 16 nf41_12:mov al, bl mov ah, al mov [edi], eax nf41_13:mov ah, bl mov al, ah shl eax, 16 nf41_14:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_31:mov ah, bl mov al, ah shl eax, 16 nf41_32:mov al, bl mov ah, al mov [edi], eax nf41_33:mov ah, bl mov al, ah shl eax, 16 nf41_34:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_51:mov ah, bl mov al, ah shl eax, 16 nf41_52:mov al, bl mov ah, al mov [edi], eax nf41_53:mov ah, bl mov al, ah shl eax, 16 nf41_54:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf41_71:mov ah, bl mov al, ah shl eax, 16 nf41_72:mov al, bl mov ah, al mov 
[edi], eax nf41_73:mov ah, bl mov al, ah shl eax, 16 nf41_74:mov al, bl mov ah, al mov [edi+4], eax add esi, 12 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf9+48 nf57: ; low 8x4x2 (12 bytes) xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf57_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf57_11-nf57_11)], bl mov [edx+(nf57_12-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_13-nf57_11)], bl mov [edx+(nf57_14-nf57_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf57_15-nf57_11)], bl mov [edx+(nf57_16-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_17-nf57_11)], bl mov [edx+(nf57_18-nf57_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf57_21-nf57_11)], bl mov [edx+(nf57_22-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_23-nf57_11)], bl mov [edx+(nf57_24-nf57_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf57_25-nf57_11)], bl mov [edx+(nf57_26-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_27-nf57_11)], bl mov [edx+(nf57_28-nf57_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf57_31-nf57_11)], bl mov [edx+(nf57_32-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_33-nf57_11)], bl mov [edx+(nf57_34-nf57_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf57_35-nf57_11)], bl mov [edx+(nf57_36-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_37-nf57_11)], bl mov [edx+(nf57_38-nf57_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf57_41-nf57_11)], bl mov [edx+(nf57_42-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_43-nf57_11)], bl mov [edx+(nf57_44-nf57_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf57_45-nf57_11)], bl mov [edx+(nf57_46-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_47-nf57_11)], bl mov [edx+(nf57_48-nf57_11)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf57_0 ; flush prefetch ALIGN 4 nf57_0: nf57_11:mov al, bl nf57_12:mov ah, bl shl eax, 16 nf57_13:mov al, bl nf57_14:mov ah, bl mov [edi], eax nf57_15:mov al, bl nf57_16:mov ah, bl shl eax, 16 nf57_17:mov al, bl nf57_18:mov ah, bl mov [edi+4], eax add edi, edx nf57_21:mov al, bl nf57_22:mov ah, bl shl eax, 16 nf57_23:mov al, bl nf57_24:mov ah, bl mov [edi], eax nf57_25:mov al, bl nf57_26:mov ah, bl shl eax, 16 nf57_27:mov al, bl nf57_28:mov ah, bl mov [edi+4], eax add edi, edx nf57_31:mov al, bl nf57_32:mov ah, bl shl eax, 16 nf57_33:mov al, bl nf57_34:mov ah, bl mov [edi], eax nf57_35:mov al, bl nf57_36:mov ah, bl shl eax, 16 nf57_37:mov al, bl nf57_38:mov ah, bl mov [edi+4], eax add edi, edx nf57_41:mov al, bl nf57_42:mov ah, bl shl eax, 16 nf57_43:mov al, bl nf57_44:mov ah, bl mov [edi], eax nf57_45:mov al, bl nf57_46:mov ah, bl shl eax, 16 nf57_47:mov al, bl nf57_48:mov ah, bl mov [edi+4], eax add esi, 12 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 nf10: ; 2x2 4x4x2 (32 bytes) mov ax, [esi] cmp al, ah ja nf26 xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf10_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf10_11-nf10_11)], bl mov [edx+(nf10_12-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_13-nf10_11)], bl mov [edx+(nf10_14-nf10_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf10_21-nf10_11)], bl mov [edx+(nf10_22-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_23-nf10_11)], bl mov [edx+(nf10_24-nf10_11)], bh mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf10_31-nf10_11)], bl mov [edx+(nf10_32-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_33-nf10_11)], bl mov [edx+(nf10_34-nf10_11)], bh mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf10_41-nf10_11)], bl mov 
[edx+(nf10_42-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_43-nf10_11)], bl mov [edx+(nf10_44-nf10_11)], bh lea edx, [edx+(nf10_51-nf10_11)] mov al, [esi+20] mov ebx, [ecx+eax*4] mov [edx+(nf10_51-nf10_51)], bl mov [edx+(nf10_52-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_53-nf10_51)], bl mov [edx+(nf10_54-nf10_51)], bh mov al, [esi+22] mov ebx, [ecx+eax*4] mov [edx+(nf10_61-nf10_51)], bl mov [edx+(nf10_62-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_63-nf10_51)], bl mov [edx+(nf10_64-nf10_51)], bh mov al, [esi+28] mov ebx, [ecx+eax*4] mov [edx+(nf10_71-nf10_51)], bl mov [edx+(nf10_72-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_73-nf10_51)], bl mov [edx+(nf10_74-nf10_51)], bh mov al, [esi+30] mov ebx, [ecx+eax*4] mov [edx+(nf10_81-nf10_51)], bl mov [edx+(nf10_82-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_83-nf10_51)], bl mov [edx+(nf10_84-nf10_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf10_0 ; flush prefetch ALIGN 4 nf10_0: nf10_11:mov al, bl nf10_12:mov ah, bl shl eax, 16 nf10_13:mov al, bl nf10_14:mov ah, bl mov [edi], eax add edi, edx nf10_21:mov al, bl nf10_22:mov ah, bl shl eax, 16 nf10_23:mov al, bl nf10_24:mov ah, bl mov [edi], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+8] mov cx, [esi+10] nf10_31:mov al, bl nf10_32:mov ah, bl shl eax, 16 nf10_33:mov al, bl nf10_34:mov ah, bl mov [edi], eax add edi, edx nf10_41:mov al, bl nf10_42:mov ah, bl shl eax, 16 nf10_43:mov al, bl nf10_44:mov ah, bl mov [edi], eax add edi, edx lea eax, [edx*4-4] sub edi, eax ; Load bl,bh,cl,ch with four colors mov bx, [esi+16] mov cx, [esi+18] nf10_51:mov al, bl nf10_52:mov ah, bl shl eax, 16 nf10_53:mov al, bl nf10_54:mov ah, bl mov [edi], eax add edi, edx nf10_61:mov al, bl nf10_62:mov ah, bl shl eax, 16 nf10_63:mov al, bl nf10_64:mov ah, bl mov [edi], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+24] mov cx, [esi+26] nf10_71:mov al, bl nf10_72:mov ah, bl shl eax, 16 nf10_73:mov al, bl nf10_74:mov ah, bl mov [edi], eax add edi, edx nf10_81:mov al, bl nf10_82:mov ah, bl shl eax, 16 nf10_83:mov al, bl nf10_84:mov ah, bl mov [edi], eax add esi, 32 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf10+16 nf26: ; 2x1 4x8x2 (24 bytes) mov ax, [esi+12] cmp al, ah ja nf42 if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 24 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf26_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf26_11-nf26_11)], bl mov [edx+(nf26_12-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_13-nf26_11)], bl mov [edx+(nf26_14-nf26_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf26_21-nf26_11)], bl mov [edx+(nf26_22-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_23-nf26_11)], bl mov [edx+(nf26_24-nf26_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf26_31-nf26_11)], bl mov [edx+(nf26_32-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_33-nf26_11)], bl mov [edx+(nf26_34-nf26_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf26_41-nf26_11)], bl mov [edx+(nf26_42-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_43-nf26_11)], bl mov [edx+(nf26_44-nf26_11)], bh lea edx, [edx+(nf26_51-nf26_11)] mov al, [esi+16] mov ebx, [ecx+eax*4] mov [edx+(nf26_51-nf26_51)], bl mov [edx+(nf26_52-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_53-nf26_51)], bl mov [edx+(nf26_54-nf26_51)], bh mov al, [esi+18] mov ebx, [ecx+eax*4] mov [edx+(nf26_61-nf26_51)], bl mov [edx+(nf26_62-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_63-nf26_51)], bl mov [edx+(nf26_64-nf26_51)], 
bh mov al, [esi+20] mov ebx, [ecx+eax*4] mov [edx+(nf26_71-nf26_51)], bl mov [edx+(nf26_72-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_73-nf26_51)], bl mov [edx+(nf26_74-nf26_51)], bh mov al, [esi+22] mov ebx, [ecx+eax*4] mov [edx+(nf26_81-nf26_51)], bl mov [edx+(nf26_82-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_83-nf26_51)], bl mov [edx+(nf26_84-nf26_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf26_0 ; flush prefetch ALIGN 4 nf26_0: nf26_11:mov al, bl nf26_12:mov ah, bl shl eax, 16 nf26_13:mov al, bl nf26_14:mov ah, bl mov [edi], eax add edi, edx nf26_21:mov al, bl nf26_22:mov ah, bl shl eax, 16 nf26_23:mov al, bl nf26_24:mov ah, bl mov [edi], eax add edi, edx nf26_31:mov al, bl nf26_32:mov ah, bl shl eax, 16 nf26_33:mov al, bl nf26_34:mov ah, bl mov [edi], eax add edi, edx nf26_41:mov al, bl nf26_42:mov ah, bl shl eax, 16 nf26_43:mov al, bl nf26_44:mov ah, bl mov [edi], eax add edi, edx lea eax, [edx*4-4] sub edi, eax ; Load bl,bh,cl,ch with four colors mov bx, [esi+12] mov cx, [esi+14] nf26_51:mov al, bl nf26_52:mov ah, bl shl eax, 16 nf26_53:mov al, bl nf26_54:mov ah, bl mov [edi], eax add edi, edx nf26_61:mov al, bl nf26_62:mov ah, bl shl eax, 16 nf26_63:mov al, bl nf26_64:mov ah, bl mov [edi], eax add edi, edx nf26_71:mov al, bl nf26_72:mov ah, bl shl eax, 16 nf26_73:mov al, bl nf26_74:mov ah, bl mov [edi], eax add edi, edx nf26_81:mov al, bl nf26_82:mov ah, bl shl eax, 16 nf26_83:mov al, bl nf26_84:mov ah, bl mov [edi], eax add esi, 24 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf10+32 nf42: ; 1x2 8x4x2 (24 bytes) if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 24 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf42_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf42_11-nf42_11)], bl mov [edx+(nf42_12-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_13-nf42_11)], bl mov [edx+(nf42_14-nf42_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf42_15-nf42_11)], bl mov [edx+(nf42_16-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_17-nf42_11)], bl mov [edx+(nf42_18-nf42_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf42_31-nf42_11)], bl mov [edx+(nf42_32-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_33-nf42_11)], bl mov [edx+(nf42_34-nf42_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf42_35-nf42_11)], bl mov [edx+(nf42_36-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_37-nf42_11)], bl mov [edx+(nf42_38-nf42_11)], bh lea edx, [edx+(nf42_51-nf42_11)] mov al, [esi+16] mov ebx, [ecx+eax*4] mov [edx+(nf42_51-nf42_51)], bl mov [edx+(nf42_52-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_53-nf42_51)], bl mov [edx+(nf42_54-nf42_51)], bh mov al, [esi+17] mov ebx, [ecx+eax*4] mov [edx+(nf42_55-nf42_51)], bl mov [edx+(nf42_56-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_57-nf42_51)], bl mov [edx+(nf42_58-nf42_51)], bh mov al, [esi+20] mov ebx, [ecx+eax*4] mov [edx+(nf42_71-nf42_51)], bl mov [edx+(nf42_72-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_73-nf42_51)], bl mov [edx+(nf42_74-nf42_51)], bh mov al, [esi+21] mov ebx, [ecx+eax*4] mov [edx+(nf42_75-nf42_51)], bl mov [edx+(nf42_76-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_77-nf42_51)], bl mov [edx+(nf42_78-nf42_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf42_0 ; flush prefetch ALIGN 4 nf42_0: nf42_11:mov al, bl nf42_12:mov ah, bl shl eax, 16 nf42_13:mov al, bl nf42_14:mov ah, bl mov [edi], eax nf42_15:mov al, bl nf42_16:mov ah, bl shl eax, 16 nf42_17:mov al, bl 
nf42_18:mov ah, bl mov [edi+4], eax add edi, edx nf42_31:mov al, bl nf42_32:mov ah, bl shl eax, 16 nf42_33:mov al, bl nf42_34:mov ah, bl mov [edi], eax nf42_35:mov al, bl nf42_36:mov ah, bl shl eax, 16 nf42_37:mov al, bl nf42_38:mov ah, bl mov [edi+4], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+12] mov cx, [esi+14] nf42_51:mov al, bl nf42_52:mov ah, bl shl eax, 16 nf42_53:mov al, bl nf42_54:mov ah, bl mov [edi], eax nf42_55:mov al, bl nf42_56:mov ah, bl shl eax, 16 nf42_57:mov al, bl nf42_58:mov ah, bl mov [edi+4], eax add edi, edx nf42_71:mov al, bl nf42_72:mov ah, bl shl eax, 16 nf42_73:mov al, bl nf42_74:mov ah, bl mov [edi], eax nf42_75:mov al, bl nf42_76:mov ah, bl shl eax, 16 nf42_77:mov al, bl nf42_78:mov ah, bl mov [edi+4], eax add esi, 24 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 nf11: ; 8x8x8 (64 bytes) if 0 ;debug add esi, 64 mov eax, 0fefefefeH ; mov ebx, eax mov ebx, 0 jmp nf_solid endif mov edx, nf_width mov eax, [esi] ;0 mov [edi], eax mov eax, [esi+4] mov [edi+4], eax add edi, edx mov eax, [esi+16] ;2 mov [edi], eax mov eax, [esi+20] mov [edi+4], eax add edi, edx mov eax, [esi+32] ;4 mov [edi], eax mov eax, [esi+36] mov [edi+4], eax add edi, edx mov eax, [esi+48] ;6 mov [edi], eax mov eax, [esi+52] mov [edi+4], eax add esi, 64 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf12: ; low 4x4x8 (16 bytes) mov edx, nf_width mov eax, [esi] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx add edi, edx mov eax, [esi+4] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx add edi, edx mov eax, [esi+8] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx add edi, edx mov eax, [esi+12] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx sub edi, nfpk_back_right add esi, 16 retn ;---------------------------------------- ALIGN 4 nf13: ; 2x2 4x4x0 (4 bytes) mov edx, nf_width mov cl, [esi] mov ch, cl mov eax, ecx shl eax, 16 mov ax, cx mov cl, [esi+1] mov ch, cl mov ebx, ecx shl ebx, 16 mov bx, cx mov [edi], eax mov [edi+4], ebx mov [edi+edx], eax mov [edi+edx+4], ebx lea edi, [edi+edx*2] mov cl, [esi+2] mov ch, cl mov eax, ecx shl eax, 16 mov ax, cx mov cl, [esi+3] mov ch, cl mov ebx, ecx shl ebx, 16 mov bx, cx mov [edi], eax mov [edi+4], ebx add edi, edx mov [edi], eax mov [edi+4], ebx sub edi, nfpk_back_right add esi, 4 retn ;---------------------------------------- ALIGN 4 nf14: ; 8x8x0 (1 byte) mov bl, [esi] ; Copy color into 8 positions inc esi mov bh, bl mov eax, ebx shl eax, 16 mov ax, bx mov ebx, eax if 0 ;debug mov eax, 080808080h mov ebx, eax endif jmp nf_solid retn ;---------------------------------------- ALIGN 4 nf15: ; mix 8x8x0 (2 bytes) mov bx, [esi] ; Copy 2 colors into 8 positions add esi, 2 ; in a checkerboard mov ax, bx shl eax, 16 mov ax, bx mov ebx, eax rol ebx, 8 if 0 ;debug mov eax, 080808080h mov ebx, eax endif nf_solid: mov edx, nf_width mov [edi], eax mov [edi+4], eax add edi, edx mov [edi], ebx mov [edi+4], ebx add edi, edx mov [edi], eax mov [edi+4], eax add edi, edx mov [edi], ebx mov [edi+4], ebx sub edi, nfpk_back_right ; 
(SHEIGHT-1)*width+8 retn nfPkDecompH ENDP if DECOMPD ; Half vertical resolution version (dither between lines) ; nfPkDecompD PROC USES ESI EDI EBX, \ ops:PTRBYTE, comp:PTRBYTE, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD LOCAL tbuf: PTRBYTE LOCAL new_row:DWORD LOCAL DiffBufPtrs:DWORD LOCAL nfpk_back_right: DWORD LOCAL wcnt:DWORD LOG_LABEL "StartPkDecomp" .data nfpk_OpTblD label dword dword offset nf0 ; Prev Same (0) dword offset nf1 ; No change (and copied to screen) (0) dword offset nf2 ; Near shift from older part of current buf (1) dword offset nf3 ; Near shift from newer part of current buf (1) dword offset nf4 ; Near shift from previous buffer (1) dword offset nf5 ; Far shift from previous buffer (2) dword offset nf6 ; Far shift from current buffer (2) ; [Or if COMPOPS, run of no changes (0)] dword offset nf7 ; 8x8x1 (10 bytes) or low 4x4x1 (4 bytes) dword offset nf8 ; 2x2 4x4x1 (16 bytes) or 2x1 4x8x1 (12 bytes) or 1x2 8x4x1 (12 bytes) dword offset nf9 ; 8x8x2 (20 bytes) or low 4x4x2 (8 bytes) or ; low 4x8x2 (12 bytes) or low 8x4x2 (12 bytes) dword offset nf10 ; 2x2 4x4x2 (32 bytes) or 2x1 4x8x2 (24 bytes) or 1x2 4x8x2 (24 bytes) dword offset nf11 ; 8x8x8 (64 bytes) dword offset nf12 ; low 4x4x8 (16 bytes) dword offset nf13 ; 2x2 4x4x0 (ie 2x2x8) (4 bytes) dword offset nf14 ; 8x8x0 (1 byte) dword offset nf15 ; mix 8x8x0 (2 bytes) .code ifdef SYMANTEC mov ebx, ds ; Allow DS to access code mov ecx, 0 mov ax, 3505h int 21h endif NF_DECOMP_INIT 0 mov eax, nf_width shl eax, 2 sub eax, nf_new_w mov new_row, eax shr nf_new_h, 1 mov eax, nf_width lea eax, [eax*2+eax-SWIDTH] mov nfpk_back_right, eax mov esi, comp mov edi, tbuf nf_StartRow: mov eax, w shr eax, 1 mov wcnt,eax ALIGN 4 nf_NextPair: dec wcnt js nf_NextRow mov ebx, ops mov al, [ebx] inc ebx mov ops, ebx xor ebx, ebx mov bl, al shr bl, 4 and eax, 0Fh push offset nf_NextPair push nfpk_OpTblD[ebx*4] jmp nfpk_OpTblD[eax*4] nf_NextRow: add edi, new_row dec h jnz nf_StartRow LOG_LABEL "EndPkDecomp" ifdef SYMANTEC mov ebx, ds ; Disable DS from accessing code mov ecx, offset DGROUP:_data_bottom[-1] mov ax, 3505h int 21h endif ret ;---------------------------------------- ALIGN 4 nf0: ; No change from previous buffer mov eax, DiffBufPtrs jmp nf_shiftr ;---------------------------------------- ALIGN 4 nf1: ; No change (and copied to screen) add edi, SWIDTH retn ;---------------------------------------- ALIGN 4 nf2: ; Near shift from older part of current buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP2[eax*2] nf_xyc_shift: xor ebx, ebx mov bl, ah shl eax, 24 sar eax, 24 sar bl, 1 pushf add eax, nfpk_ShiftY[ebx*4] jmp nf_shift ;---------------------------------------- ALIGN 4 nf3: ; Near shift from newer part of current buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP2[eax*2] neg al neg ah jmp nf_xyc_shift ;---------------------------------------- ALIGN 4 nf4: ; Near shift from previous buffer xor eax, eax mov al, [esi] inc esi mov ax, nfpk_ShiftP1[eax*2] jmp nf_xyp_shift ;---------------------------------------- ALIGN 4 nf5: ; Far shift from previous buffer mov ax, [esi] add esi, 2 nf_xyp_shift: xor ebx, ebx mov bl, ah shl eax, 24 sar eax, 24 sar bl, 1 pushf add eax, nfpk_ShiftY[ebx*4] add eax, DiffBufPtrs jmp nf_shift ;---------------------------------------- ALIGN 4 if COMPOPS nf6: ; Run of no changes (must only appear in first nibble opcodes) ; Next nibble k specifies 2k+4 squares with no changes add esp, 4 ; Next nibble is not an opcode add ebx, 2 ; (minimum of 4 squares) ALIGN 4 nf6a: add edi, SWIDTH*2 ; Advance over 
two squares dec ebx jz nf6z ; Last pair of squares dec wcnt ; Same row? jns nf6a ; Yes add edi, new_row ; Advance to next row dec h ; Decrement row count (should never become zero here) mov eax, w ; Reset wcnt shr eax ,1 dec eax mov wcnt, eax jmp nf6a nf6z: retn else nf6: ; Far shift from current buffer mov ax, [esi] add esi, 2 jmp nf_xyc_shift endif ;---------------------------------------- ALIGN 4 nf_shift: popf if 0 ;debug mov eax, 0 mov ebx, eax jmp nf_solid endif jc nf_shiftd nf_shiftr: mov ebx, esi ; save esi lea esi, [edi+eax] mov edx, nf_width REPEAT 3 mov eax, [esi] mov [edi], eax mov eax, [esi+4] mov [edi+4], eax add esi, edx add edi, edx ENDM mov eax, [esi] mov [edi], eax mov eax, [esi+4] mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 mov esi, ebx ; restore esi retn nf_shiftd: push esi lea esi, [edi+eax] mov edx, nf_width mov ebx, 000ff00ffH REPEAT 3 mov eax, [esi] mov ecx, eax xor ecx, [esi+edx] and ecx, ebx xor eax, ecx mov [edi], eax mov eax, [esi+4] mov ecx, eax xor ecx, [esi+edx+4] and ecx, ebx xor eax, ecx mov [edi+4], eax add esi, edx add edi, edx ENDM mov eax, [esi] mov ecx, eax xor ecx, [esi+edx] and ecx, ebx xor eax, ecx mov [edi], eax mov eax, [esi+4] mov ecx, eax xor ecx, [esi+edx+4] and ecx, ebx xor eax, ecx mov [edi+4], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 pop esi retn ;---------------------------------------- ALIGN 4 nf7: ; 8x8x1 (10 bytes) mov ax, [esi] cmp al, ah ja nf23 if 0 ;debug add esi, 10 mov eax, 0fefefefeH mov ebx, eax jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf7_11+2 mov al, [esi+2] mov bl, al xor bl, [esi+3] and bl, 0aaH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf7_11-nf7_11)], bl mov [edx+(nf7_12-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_13-nf7_11)], bl mov [edx+(nf7_14-nf7_11)], bh mov al, [esi+4] mov bl, al xor bl, [esi+5] and bl, 0aaH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf7_31-nf7_11)], bl mov [edx+(nf7_32-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_33-nf7_11)], bl mov [edx+(nf7_34-nf7_11)], bh lea edx, [edx+(nf7_51-nf7_11)] mov al, [esi+6] mov bl, al xor bl, [esi+7] and bl, 0aaH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf7_51-nf7_51)], bl mov [edx+(nf7_52-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_53-nf7_51)], bl mov [edx+(nf7_54-nf7_51)], bh mov al, [esi+8] mov bl, al xor bl, [esi+9] and bl, 0aaH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf7_71-nf7_51)], bl mov [edx+(nf7_72-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_73-nf7_51)], bl mov [edx+(nf7_74-nf7_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). 
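;
; The pattern bytes handled above drive an 8x8, 1-bit-per-pixel block:
; two color bytes followed by eight bitmap bytes (10 bytes total), bits
; read least significant first.  Illustrative C sketch of the plain
; full-resolution expansion (not part of the build; helper name is
; illustrative, it ignores the half-vertical-resolution dithering this
; routine performs, and it assumes a set bit selects the second color):
;
;   static void expand_8x8x1(unsigned char *dst, int pitch,
;                            const unsigned char *src)
;   {
;       unsigned char c0 = src[0], c1 = src[1];     /* two colors      */
;       for (int y = 0; y < 8; y++) {
;           unsigned char bits = src[2 + y];        /* one byte per row */
;           for (int x = 0; x < 8; x++)
;               dst[y*pitch + x] = ((bits >> x) & 1) ? c1 : c0;
;       }
;   }
;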
mov cx, [esi] mov esi,nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf7_0 ; flush prefetch ALIGN 4 nf7_0: nf7_11: mov ax, bx shl eax, 16 nf7_12: mov ax, bx mov [edi], eax nf7_13: mov ax, bx shl eax, 16 nf7_14: mov ax, bx mov [edi+4], eax add edi, esi nf7_31: mov ax, bx shl eax, 16 nf7_32: mov ax, bx mov [edi], eax nf7_33: mov ax, bx shl eax, 16 nf7_34: mov ax, bx mov [edi+4], eax add edi, esi nf7_51: mov ax, bx shl eax, 16 nf7_52: mov ax, bx mov [edi], eax nf7_53: mov ax, bx shl eax, 16 nf7_54: mov ax, bx mov [edi+4], eax add edi, esi nf7_71: mov ax, bx shl eax, 16 nf7_72: mov ax, bx mov [edi], eax nf7_73: mov ax, bx shl eax, 16 nf7_74: mov ax, bx mov [edi+4], eax pop esi pop ebp add esi, 10 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf7+16 nf23: ; low 4x4x1 (4 bytes) xor eax, eax lea ecx, nfpk_mov4l lea edx, byte ptr ds:nf23_11+2 mov al, [esi+2] and al, 0fH mov ebx, [ecx+eax*4] mov [edx+(nf23_11-nf23_11)], bl mov [edx+(nf23_12-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_13-nf23_11)], bl mov [edx+(nf23_14-nf23_11)], bh mov al, [esi+2] shr al, 4 mov ebx, [ecx+eax*4] mov [edx+(nf23_31-nf23_11)], bl mov [edx+(nf23_32-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_33-nf23_11)], bl mov [edx+(nf23_34-nf23_11)], bh mov al, [esi+3] and al, 0fH mov ebx, [ecx+eax*4] mov [edx+(nf23_51-nf23_11)], bl mov [edx+(nf23_52-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_53-nf23_11)], bl mov [edx+(nf23_54-nf23_11)], bh mov al, [esi+3] shr al, 4 mov ebx, [ecx+eax*4] mov [edx+(nf23_71-nf23_11)], bl mov [edx+(nf23_72-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_73-nf23_11)], bl mov [edx+(nf23_74-nf23_11)], bh mov edx, nf_width ; load bx,cx with 00,11 color combinations mov bx, [esi] mov cl, bh mov bh, bl mov ch, cl jmp nf23_0 ; flush prefetch ALIGN 4 nf23_0: nf23_11:mov ax, bx shl eax, 16 nf23_12:mov ax, bx mov [edi], eax nf23_13:mov ax, bx shl eax, 16 nf23_14:mov ax, bx mov [edi+4], eax add edi, edx nf23_31:mov ax, bx shl eax, 16 nf23_32:mov ax, bx mov [edi], eax nf23_33:mov ax, bx shl eax, 16 nf23_34:mov ax, bx mov [edi+4], eax add edi, edx nf23_51:mov ax, bx shl eax, 16 nf23_52:mov ax, bx mov [edi], eax nf23_53:mov ax, bx shl eax, 16 nf23_54:mov ax, bx mov [edi+4], eax add edi, edx nf23_71:mov ax, bx shl eax, 16 nf23_72:mov ax, bx mov [edi], eax nf23_73:mov ax, bx shl eax, 16 nf23_74:mov ax, bx mov [edi+4], eax sub edi, nfpk_back_right add esi, 4 retn ;---------------------------------------- ALIGN 4 nf8: ; 2x2 4x4x1 (16 bytes) mov ax, [esi] cmp al, ah ja nf24 ; Note: This could be made faster with a new (16 16-bit entry) table. 
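;
; Layout note: the 16 bytes of this 2x2 4x4x1 form hold four independent
; 4x4 quadrants, each stored as two color bytes plus two pattern bytes.
; The code below renders the left half first (top-left, then bottom-left),
; then steps back up and four pixels right for the right half.
;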
xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf8_11+2 mov al, [esi+2] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf8_11-nf8_11)], bl mov [edx+(nf8_12-nf8_11)], bh mov al, [esi+3] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf8_21-nf8_11)], bl mov [edx+(nf8_22-nf8_11)], bh mov al, [esi+6] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf8_31-nf8_11)], bl mov [edx+(nf8_32-nf8_11)], bh mov al, [esi+7] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf8_41-nf8_11)], bl mov [edx+(nf8_42-nf8_11)], bh add edx, nf8_51-nf8_11 mov al, [esi+10] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf8_51-nf8_51)], bl mov [edx+(nf8_52-nf8_51)], bh mov al, [esi+11] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf8_61-nf8_51)], bl mov [edx+(nf8_62-nf8_51)], bh mov al, [esi+14] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf8_71-nf8_51)], bl mov [edx+(nf8_72-nf8_51)], bh mov al, [esi+15] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf8_81-nf8_51)], bl mov [edx+(nf8_82-nf8_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf8_0 ; flush prefetch ALIGN 4 nf8_0: nf8_11: mov ax, bx shl eax, 16 nf8_12: mov ax, bx mov [edi], eax add edi, esi nf8_21: mov ax, bx shl eax, 16 nf8_22: mov ax, bx mov [edi], eax add edi, esi mov eax, [esp] mov cx, [eax+4] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_31: mov ax, bx shl eax, 16 nf8_32: mov ax, bx mov [edi], eax add edi, esi nf8_41: mov ax, bx shl eax, 16 nf8_42: mov ax, bx mov [edi], eax add edi, esi lea eax, [esi*4-4] sub edi, eax mov eax, [esp] mov cx, [eax+8] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_51: mov ax, bx shl eax, 16 nf8_52: mov ax, bx mov [edi], eax add edi, esi nf8_61: mov ax, bx shl eax, 16 nf8_62: mov ax, bx mov [edi], eax add edi, esi mov eax, [esp] mov cx, [eax+12] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf8_71: mov ax, bx shl eax, 16 nf8_72: mov ax, bx mov [edi], eax add edi, esi nf8_81: mov ax, bx shl eax, 16 nf8_82: mov ax, bx mov [edi], eax pop esi pop ebp add esi, 16 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf8+16 nf24: ; 2x1 4x8x1 (12 bytes) mov ax, [esi+6] cmp al, ah ja nf40 ; Note: This could be made faster with a new (16 16-bit entry) table. 
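;
; The 1-bit-per-pixel opcode picks its sub-layout from the ordering of
; the stored color pairs (the HiColor versions further below test bit 15
; instead).  Illustrative C sketch of the selection only (not part of
; the build; helper name is illustrative):
;
;   /* which layout follows the opcode's leading bytes */
;   static int onebit_layout(const unsigned char *p)
;   {
;       if (p[0] <= p[1]) return 0;   /* 2x2 of 4x4x1, 16 bytes */
;       if (p[6] <= p[7]) return 1;   /* 2x1 of 4x8x1, 12 bytes */
;       return 2;                     /* 1x2 of 8x4x1, 12 bytes */
;   }
;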
xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf24_11+2 mov al, [esi+2] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf24_11-nf24_11)], bl mov [edx+(nf24_12-nf24_11)], bh mov al, [esi+3] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf24_21-nf24_11)], bl mov [edx+(nf24_22-nf24_11)], bh mov al, [esi+4] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf24_31-nf24_11)], bl mov [edx+(nf24_32-nf24_11)], bh mov al, [esi+5] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf24_41-nf24_11)], bl mov [edx+(nf24_42-nf24_11)], bh add edx, nf24_51-nf24_11 mov al, [esi+8] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf24_51-nf24_51)], bl mov [edx+(nf24_52-nf24_51)], bh mov al, [esi+9] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf24_61-nf24_51)], bl mov [edx+(nf24_62-nf24_51)], bh mov al, [esi+10] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf24_71-nf24_51)], bl mov [edx+(nf24_72-nf24_51)], bh mov al, [esi+11] mov bl, al shr bl, 4 xor bl, al and bl, 0aH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf24_81-nf24_51)], bl mov [edx+(nf24_82-nf24_51)], bh push ebp push esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf24_0 ; flush prefetch ALIGN 4 nf24_0: nf24_11:mov ax, bx shl eax, 16 nf24_12:mov ax, bx mov [edi], eax add edi, esi nf24_21:mov ax, bx shl eax, 16 nf24_22:mov ax, bx mov [edi], eax add edi, esi nf24_31:mov ax, bx shl eax, 16 nf24_32:mov ax, bx mov [edi], eax add edi, esi nf24_41:mov ax, bx shl eax, 16 nf24_42:mov ax, bx mov [edi], eax add edi, esi lea eax, [esi*4-4] sub edi, eax mov eax, [esp] mov cx, [eax+6] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf24_51:mov ax, bx shl eax, 16 nf24_52:mov ax, bx mov [edi], eax add edi, esi nf24_61:mov ax, bx shl eax, 16 nf24_62:mov ax, bx mov [edi], eax add edi, esi nf24_71:mov ax, bx shl eax, 16 nf24_72:mov ax, bx mov [edi], eax add edi, esi nf24_81:mov ax, bx shl eax, 16 nf24_82:mov ax, bx mov [edi], eax pop esi pop ebp add esi, 12 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf8+32 nf40: ; 1x2 8x4x1 (12 bytes) xor eax, eax lea ecx, nfpk_mov8 lea edx, byte ptr ds:nf40_11+2 mov al, [esi+2] mov bl, al xor bl, [esi+3] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf40_11-nf40_11)], bl mov [edx+(nf40_12-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_13-nf40_11)], bl mov [edx+(nf40_14-nf40_11)], bh mov al, [esi+4] mov bl, al xor bl, [esi+5] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf40_31-nf40_11)], bl mov [edx+(nf40_32-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_33-nf40_11)], bl mov [edx+(nf40_34-nf40_11)], bh add edx, nf40_51-nf40_11 mov al, [esi+8] mov bl, al xor bl, [esi+9] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf40_51-nf40_51)], bl mov [edx+(nf40_52-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_53-nf40_51)], bl mov [edx+(nf40_54-nf40_51)], bh mov al, [esi+10] mov bl, al xor bl, [esi+11] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf40_71-nf40_51)], bl mov [edx+(nf40_72-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_73-nf40_51)], bl mov [edx+(nf40_74-nf40_51)], bh push ebp push 
esi ; load bx,dx,cx,bp with 00,01,10,11 color combinations ; (note that bits are read least significant first). mov cx, [esi] mov esi, nf_width mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax jmp nf40_0 ; flush prefetch ALIGN 4 nf40_0: nf40_11:mov ax, bx shl eax, 16 nf40_12:mov ax, bx mov [edi], eax nf40_13:mov ax, bx shl eax, 16 nf40_14:mov ax, bx mov [edi+4], eax add edi, esi nf40_31:mov ax, bx shl eax, 16 nf40_32:mov ax, bx mov [edi], eax nf40_33:mov ax, bx shl eax, 16 nf40_34:mov ax, bx mov [edi+4], eax add edi, esi mov eax, [esp] mov cx, [eax+6] mov bl,cl mov bh,cl mov dl,ch mov dh,cl mov al,ch mov ah,ch mov ebp,eax nf40_51:mov ax, bx shl eax, 16 nf40_52:mov ax, bx mov [edi], eax nf40_53:mov ax, bx shl eax, 16 nf40_54:mov ax, bx mov [edi+4], eax add edi, esi nf40_71:mov ax, bx shl eax, 16 nf40_72:mov ax, bx mov [edi], eax nf40_73:mov ax, bx shl eax, 16 nf40_74:mov ax, bx mov [edi+4], eax pop esi pop ebp add esi, 12 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf9: ; 8x8x2 (20 bytes) mov eax, [esi] cmp al, ah ja nf41 shr eax, 16 cmp al, ah ja nf25 xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf9_11+1 mov al, [esi+4] mov bl, al xor bl, [esi+6] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf9_11-nf9_11)], bl mov [edx+(nf9_12-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_13-nf9_11)], bl mov [edx+(nf9_14-nf9_11)], bh mov al, [esi+5] mov bl, al xor bl, [esi+7] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf9_15-nf9_11)], bl mov [edx+(nf9_16-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_17-nf9_11)], bl mov [edx+(nf9_18-nf9_11)], bh mov al, [esi+8] mov bl, al xor bl, [esi+10] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf9_31-nf9_11)], bl mov [edx+(nf9_32-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_33-nf9_11)], bl mov [edx+(nf9_34-nf9_11)], bh mov al, [esi+9] mov bl, al xor bl, [esi+11] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf9_35-nf9_11)], bl mov [edx+(nf9_36-nf9_11)], bh shr ebx, 16 mov [edx+(nf9_37-nf9_11)], bl mov [edx+(nf9_38-nf9_11)], bh lea edx, [edx+(nf9_51-nf9_11)] mov al, [esi+12] mov bl, al xor bl, [esi+14] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf9_51-nf9_51)], bl mov [edx+(nf9_52-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_53-nf9_51)], bl mov [edx+(nf9_54-nf9_51)], bh mov al, [esi+13] mov bl, al xor bl, [esi+15] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf9_55-nf9_51)], bl mov [edx+(nf9_56-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_57-nf9_51)], bl mov [edx+(nf9_58-nf9_51)], bh mov al, [esi+16] mov bl, al xor bl, [esi+18] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf9_71-nf9_51)], bl mov [edx+(nf9_72-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_73-nf9_51)], bl mov [edx+(nf9_74-nf9_51)], bh mov al, [esi+17] mov bl, al xor bl, [esi+19] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf9_75-nf9_51)], bl mov [edx+(nf9_76-nf9_51)], bh shr ebx, 16 mov [edx+(nf9_77-nf9_51)], bl mov [edx+(nf9_78-nf9_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf9_0 ; flush prefetch ALIGN 4 nf9_0: nf9_11: mov al, bl nf9_12: mov ah, bl shl eax, 16 nf9_13: mov al, bl nf9_14: mov ah, bl mov [edi], eax nf9_15: mov al, bl nf9_16: mov ah, bl shl eax, 16 nf9_17: mov al, bl nf9_18: mov ah, bl mov [edi+4], eax add edi, edx nf9_31: mov al, bl nf9_32: mov ah, bl shl eax, 16 nf9_33: mov al, bl nf9_34: mov ah, bl mov [edi], eax nf9_35: mov al, bl nf9_36: mov ah, bl shl eax, 16 nf9_37: mov al, bl nf9_38: mov ah, bl mov 
[edi+4], eax add edi, edx nf9_51: mov al, bl nf9_52: mov ah, bl shl eax, 16 nf9_53: mov al, bl nf9_54: mov ah, bl mov [edi], eax nf9_55: mov al, bl nf9_56: mov ah, bl shl eax, 16 nf9_57: mov al, bl nf9_58: mov ah, bl mov [edi+4], eax add edi, edx nf9_71: mov al, bl nf9_72: mov ah, bl shl eax, 16 nf9_73: mov al, bl nf9_74: mov ah, bl mov [edi], eax nf9_75: mov al, bl nf9_76: mov ah, bl shl eax, 16 nf9_77: mov al, bl nf9_78: mov ah, bl mov [edi+4], eax add esi, 20 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 ;nf9+16 nf25: ; low 4x4x2 (8 bytes) if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 8 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf25_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf25_14-nf25_11)], bl mov [edx+(nf25_13-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_12-nf25_11)], bl mov [edx+(nf25_11-nf25_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf25_24-nf25_11)], bl mov [edx+(nf25_23-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_22-nf25_11)], bl mov [edx+(nf25_21-nf25_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf25_34-nf25_11)], bl mov [edx+(nf25_33-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_32-nf25_11)], bl mov [edx+(nf25_31-nf25_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf25_44-nf25_11)], bl mov [edx+(nf25_43-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_42-nf25_11)], bl mov [edx+(nf25_41-nf25_11)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf25_0 ; flush prefetch ALIGN 4 nf25_0: nf25_11:mov ah, bl mov al, ah shl eax, 16 nf25_12:mov al, bl mov ah, al mov [edi], eax nf25_13:mov ah, bl mov al, ah shl eax, 16 nf25_14:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf25_21:mov ah, bl mov al, ah shl eax, 16 nf25_22:mov al, bl mov ah, al mov [edi], eax nf25_23:mov ah, bl mov al, ah shl eax, 16 nf25_24:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf25_31:mov ah, bl mov al, ah shl eax, 16 nf25_32:mov al, bl mov ah, al mov [edi], eax nf25_33:mov ah, bl mov al, ah shl eax, 16 nf25_34:mov al, bl mov ah, al mov [edi+4], eax add edi, edx nf25_41:mov ah, bl mov al, ah shl eax, 16 nf25_42:mov al, bl mov ah, al mov [edi], eax nf25_43:mov ah, bl mov al, ah shl eax, 16 nf25_44:mov al, bl mov ah, al mov [edi+4], eax add esi, 8 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf9+32 nf41: ; low 4x8x2 (12 bytes) shr eax, 16 cmp al, ah ja nf57 if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 12 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf41_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf41_14-nf41_11)], bl mov [edx+(nf41_13-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_12-nf41_11)], bl mov [edx+(nf41_11-nf41_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf41_24-nf41_11)], bl mov [edx+(nf41_23-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_22-nf41_11)], bl mov [edx+(nf41_21-nf41_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf41_34-nf41_11)], bl mov [edx+(nf41_33-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_32-nf41_11)], bl mov [edx+(nf41_31-nf41_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf41_44-nf41_11)], bl mov [edx+(nf41_43-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_42-nf41_11)], bl mov [edx+(nf41_41-nf41_11)], bh lea edx, [edx+(nf41_51-nf41_11)] mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf41_54-nf41_51)], bl mov [edx+(nf41_53-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_52-nf41_51)], bl mov [edx+(nf41_51-nf41_51)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] 
mov [edx+(nf41_64-nf41_51)], bl mov [edx+(nf41_63-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_62-nf41_51)], bl mov [edx+(nf41_61-nf41_51)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf41_74-nf41_51)], bl mov [edx+(nf41_73-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_72-nf41_51)], bl mov [edx+(nf41_71-nf41_51)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf41_84-nf41_51)], bl mov [edx+(nf41_83-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_82-nf41_51)], bl mov [edx+(nf41_81-nf41_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf41_0 ; flush prefetch ALIGN 4 nf41_0: nf41_11:mov ah, bl mov al, ah nf41_21:mov ah, bl shl eax, 16 nf41_22:mov al, bl mov ah, al nf41_12:mov al, bl mov [edi], eax nf41_13:mov ah, bl mov al, ah nf41_23:mov ah, bl shl eax, 16 nf41_24:mov al, bl mov ah, al nf41_14:mov al, bl mov [edi+4], eax add edi, edx nf41_31:mov ah, bl mov al, ah nf41_41:mov ah, bl shl eax, 16 nf41_42:mov al, bl mov ah, al nf41_32:mov al, bl mov [edi], eax nf41_33:mov ah, bl mov al, ah nf41_43:mov ah, bl shl eax, 16 nf41_44:mov al, bl mov ah, al nf41_34:mov al, bl mov [edi+4], eax add edi, edx nf41_51:mov ah, bl mov al, ah nf41_61:mov ah, bl shl eax, 16 nf41_62:mov al, bl mov ah, al nf41_52:mov al, bl mov [edi], eax nf41_53:mov ah, bl mov al, ah nf41_63:mov ah, bl shl eax, 16 nf41_64:mov al, bl mov ah, al nf41_54:mov al, bl mov [edi+4], eax add edi, edx nf41_71:mov ah, bl mov al, ah nf41_81:mov ah, bl shl eax, 16 nf41_82:mov al, bl mov ah, al nf41_72:mov al, bl mov [edi], eax nf41_73:mov ah, bl mov al, ah nf41_83:mov ah, bl shl eax, 16 nf41_84:mov al, bl mov ah, al nf41_74:mov al, bl mov [edi+4], eax add esi, 12 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf9+48 nf57: ; low 8x4x2 (12 bytes) xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf57_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf57_11-nf57_11)], bl mov [edx+(nf57_12-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_13-nf57_11)], bl mov [edx+(nf57_14-nf57_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf57_15-nf57_11)], bl mov [edx+(nf57_16-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_17-nf57_11)], bl mov [edx+(nf57_18-nf57_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf57_21-nf57_11)], bl mov [edx+(nf57_22-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_23-nf57_11)], bl mov [edx+(nf57_24-nf57_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf57_25-nf57_11)], bl mov [edx+(nf57_26-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_27-nf57_11)], bl mov [edx+(nf57_28-nf57_11)], bh mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf57_31-nf57_11)], bl mov [edx+(nf57_32-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_33-nf57_11)], bl mov [edx+(nf57_34-nf57_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf57_35-nf57_11)], bl mov [edx+(nf57_36-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_37-nf57_11)], bl mov [edx+(nf57_38-nf57_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf57_41-nf57_11)], bl mov [edx+(nf57_42-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_43-nf57_11)], bl mov [edx+(nf57_44-nf57_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf57_45-nf57_11)], bl mov [edx+(nf57_46-nf57_11)], bh shr ebx, 16 mov [edx+(nf57_47-nf57_11)], bl mov [edx+(nf57_48-nf57_11)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf57_0 ; flush prefetch ALIGN 4 nf57_0: nf57_11:mov al, bl nf57_12:mov ah, bl shl eax, 16 nf57_13:mov al, bl nf57_14:mov ah, bl mov [edi], eax nf57_15:mov al, bl 
nf57_16:mov ah, bl shl eax, 16 nf57_17:mov al, bl nf57_18:mov ah, bl mov [edi+4], eax add edi, edx nf57_21:mov al, bl nf57_22:mov ah, bl shl eax, 16 nf57_23:mov al, bl nf57_24:mov ah, bl mov [edi], eax nf57_25:mov al, bl nf57_26:mov ah, bl shl eax, 16 nf57_27:mov al, bl nf57_28:mov ah, bl mov [edi+4], eax add edi, edx nf57_31:mov al, bl nf57_32:mov ah, bl shl eax, 16 nf57_33:mov al, bl nf57_34:mov ah, bl mov [edi], eax nf57_35:mov al, bl nf57_36:mov ah, bl shl eax, 16 nf57_37:mov al, bl nf57_38:mov ah, bl mov [edi+4], eax add edi, edx nf57_41:mov al, bl nf57_42:mov ah, bl shl eax, 16 nf57_43:mov al, bl nf57_44:mov ah, bl mov [edi], eax nf57_45:mov al, bl nf57_46:mov ah, bl shl eax, 16 nf57_47:mov al, bl nf57_48:mov ah, bl mov [edi+4], eax add esi, 12 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 nf10: ; 2x2 4x4x2 (32 bytes) mov ax, [esi] cmp al, ah ja nf26 xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf10_11+1 mov al, [esi+4] mov bl, al xor bl, [esi+5] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf10_11-nf10_11)], bl mov [edx+(nf10_12-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_13-nf10_11)], bl mov [edx+(nf10_14-nf10_11)], bh mov al, [esi+6] mov bl, al xor bl, [esi+7] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf10_21-nf10_11)], bl mov [edx+(nf10_22-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_23-nf10_11)], bl mov [edx+(nf10_24-nf10_11)], bh mov al, [esi+12] mov bl, al xor bl, [esi+13] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf10_31-nf10_11)], bl mov [edx+(nf10_32-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_33-nf10_11)], bl mov [edx+(nf10_34-nf10_11)], bh mov al, [esi+14] mov bl, al xor bl, [esi+15] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf10_41-nf10_11)], bl mov [edx+(nf10_42-nf10_11)], bh shr ebx, 16 mov [edx+(nf10_43-nf10_11)], bl mov [edx+(nf10_44-nf10_11)], bh lea edx, [edx+(nf10_51-nf10_11)] mov al, [esi+20] mov bl, al xor bl, [esi+21] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf10_51-nf10_51)], bl mov [edx+(nf10_52-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_53-nf10_51)], bl mov [edx+(nf10_54-nf10_51)], bh mov al, [esi+22] mov bl, al xor bl, [esi+23] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf10_61-nf10_51)], bl mov [edx+(nf10_62-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_63-nf10_51)], bl mov [edx+(nf10_64-nf10_51)], bh mov al, [esi+28] mov bl, al xor bl, [esi+29] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf10_71-nf10_51)], bl mov [edx+(nf10_72-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_73-nf10_51)], bl mov [edx+(nf10_74-nf10_51)], bh mov al, [esi+30] mov bl, al xor bl, [esi+31] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf10_81-nf10_51)], bl mov [edx+(nf10_82-nf10_51)], bh shr ebx, 16 mov [edx+(nf10_83-nf10_51)], bl mov [edx+(nf10_84-nf10_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf10_0 ; flush prefetch ALIGN 4 nf10_0: nf10_11:mov al, bl nf10_12:mov ah, bl shl eax, 16 nf10_13:mov al, bl nf10_14:mov ah, bl mov [edi], eax add edi, edx nf10_21:mov al, bl nf10_22:mov ah, bl shl eax, 16 nf10_23:mov al, bl nf10_24:mov ah, bl mov [edi], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+8] mov cx, [esi+10] nf10_31:mov al, bl nf10_32:mov ah, bl shl eax, 16 nf10_33:mov al, bl nf10_34:mov ah, bl mov [edi], eax add edi, edx nf10_41:mov al, bl nf10_42:mov ah, bl shl eax, 16 nf10_43:mov al, bl nf10_44:mov ah, bl mov [edi], eax add edi, edx lea eax, [edx*4-4] sub edi, eax ; Load bl,bh,cl,ch with 
four colors mov bx, [esi+16] mov cx, [esi+18] nf10_51:mov al, bl nf10_52:mov ah, bl shl eax, 16 nf10_53:mov al, bl nf10_54:mov ah, bl mov [edi], eax add edi, edx nf10_61:mov al, bl nf10_62:mov ah, bl shl eax, 16 nf10_63:mov al, bl nf10_64:mov ah, bl mov [edi], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+24] mov cx, [esi+26] nf10_71:mov al, bl nf10_72:mov ah, bl shl eax, 16 nf10_73:mov al, bl nf10_74:mov ah, bl mov [edi], eax add edi, edx nf10_81:mov al, bl nf10_82:mov ah, bl shl eax, 16 nf10_83:mov al, bl nf10_84:mov ah, bl mov [edi], eax add esi, 32 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf10+16 nf26: ; 2x1 4x8x2 (24 bytes) mov ax, [esi+12] cmp al, ah ja nf42 if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 24 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf26_11+1 mov al, [esi+4] mov bl, al xor bl, [esi+5] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf26_11-nf26_11)], bl mov [edx+(nf26_12-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_13-nf26_11)], bl mov [edx+(nf26_14-nf26_11)], bh mov al, [esi+6] mov bl, al xor bl, [esi+7] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf26_21-nf26_11)], bl mov [edx+(nf26_22-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_23-nf26_11)], bl mov [edx+(nf26_24-nf26_11)], bh mov al, [esi+8] mov bl, al xor bl, [esi+9] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf26_31-nf26_11)], bl mov [edx+(nf26_32-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_33-nf26_11)], bl mov [edx+(nf26_34-nf26_11)], bh mov al, [esi+10] mov bl, al xor bl, [esi+11] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf26_41-nf26_11)], bl mov [edx+(nf26_42-nf26_11)], bh shr ebx, 16 mov [edx+(nf26_43-nf26_11)], bl mov [edx+(nf26_44-nf26_11)], bh lea edx, [edx+(nf26_51-nf26_11)] mov al, [esi+16] mov bl, al xor bl, [esi+17] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf26_51-nf26_51)], bl mov [edx+(nf26_52-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_53-nf26_51)], bl mov [edx+(nf26_54-nf26_51)], bh mov al, [esi+18] mov bl, al xor bl, [esi+19] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf26_61-nf26_51)], bl mov [edx+(nf26_62-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_63-nf26_51)], bl mov [edx+(nf26_64-nf26_51)], bh mov al, [esi+20] mov bl, al xor bl, [esi+21] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf26_71-nf26_51)], bl mov [edx+(nf26_72-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_73-nf26_51)], bl mov [edx+(nf26_74-nf26_51)], bh mov al, [esi+22] mov bl, al xor bl, [esi+23] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf26_81-nf26_51)], bl mov [edx+(nf26_82-nf26_51)], bh shr ebx, 16 mov [edx+(nf26_83-nf26_51)], bl mov [edx+(nf26_84-nf26_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf26_0 ; flush prefetch ALIGN 4 nf26_0: nf26_11:mov al, bl nf26_12:mov ah, bl shl eax, 16 nf26_13:mov al, bl nf26_14:mov ah, bl mov [edi], eax add edi, edx nf26_21:mov al, bl nf26_22:mov ah, bl shl eax, 16 nf26_23:mov al, bl nf26_24:mov ah, bl mov [edi], eax add edi, edx nf26_31:mov al, bl nf26_32:mov ah, bl shl eax, 16 nf26_33:mov al, bl nf26_34:mov ah, bl mov [edi], eax add edi, edx nf26_41:mov al, bl nf26_42:mov ah, bl shl eax, 16 nf26_43:mov al, bl nf26_44:mov ah, bl mov [edi], eax add edi, edx lea eax, [edx*4-4] sub edi, eax ; Load bl,bh,cl,ch with four colors mov bx, [esi+12] mov cx, [esi+14] nf26_51:mov al, bl nf26_52:mov ah, bl shl eax, 16 nf26_53:mov al, bl nf26_54:mov ah, bl mov [edi], 
eax add edi, edx nf26_61:mov al, bl nf26_62:mov ah, bl shl eax, 16 nf26_63:mov al, bl nf26_64:mov ah, bl mov [edi], eax add edi, edx nf26_71:mov al, bl nf26_72:mov ah, bl shl eax, 16 nf26_73:mov al, bl nf26_74:mov ah, bl mov [edi], eax add edi, edx nf26_81:mov al, bl nf26_82:mov ah, bl shl eax, 16 nf26_83:mov al, bl nf26_84:mov ah, bl mov [edi], eax add esi, 24 sub edi, 4 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf10+32 nf42: ; 1x2 8x4x2 (24 bytes) if 0 ;debug mov eax, 0 mov ebx, 0 add esi, 24 jmp nf_solid endif xor eax, eax lea ecx, nfpk_mov4 lea edx, byte ptr ds:nf42_11+1 mov al, [esi+4] mov bl, al xor bl, [esi+6] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf42_11-nf42_11)], bl mov [edx+(nf42_12-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_13-nf42_11)], bl mov [edx+(nf42_14-nf42_11)], bh mov al, [esi+5] mov bl, al xor bl, [esi+7] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf42_15-nf42_11)], bl mov [edx+(nf42_16-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_17-nf42_11)], bl mov [edx+(nf42_18-nf42_11)], bh mov al, [esi+8] mov bl, al xor bl, [esi+10] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf42_31-nf42_11)], bl mov [edx+(nf42_32-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_33-nf42_11)], bl mov [edx+(nf42_34-nf42_11)], bh mov al, [esi+9] mov bl, al xor bl, [esi+11] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf42_35-nf42_11)], bl mov [edx+(nf42_36-nf42_11)], bh shr ebx, 16 mov [edx+(nf42_37-nf42_11)], bl mov [edx+(nf42_38-nf42_11)], bh lea edx, [edx+(nf42_51-nf42_11)] mov al, [esi+16] mov bl, al xor bl, [esi+18] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf42_51-nf42_51)], bl mov [edx+(nf42_52-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_53-nf42_51)], bl mov [edx+(nf42_54-nf42_51)], bh mov al, [esi+17] mov bl, al xor bl, [esi+19] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf42_55-nf42_51)], bl mov [edx+(nf42_56-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_57-nf42_51)], bl mov [edx+(nf42_58-nf42_51)], bh mov al, [esi+20] mov bl, al xor bl, [esi+22] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf42_71-nf42_51)], bl mov [edx+(nf42_72-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_73-nf42_51)], bl mov [edx+(nf42_74-nf42_51)], bh mov al, [esi+21] mov bl, al xor bl, [esi+23] and bl, 0ccH xor al, bl mov ebx, [ecx+eax*4] mov [edx+(nf42_75-nf42_51)], bl mov [edx+(nf42_76-nf42_51)], bh shr ebx, 16 mov [edx+(nf42_77-nf42_51)], bl mov [edx+(nf42_78-nf42_51)], bh ; Load bl,bh,cl,ch with four colors mov bx, [esi] mov cx, [esi+2] mov edx, nf_width jmp nf42_0 ; flush prefetch ALIGN 4 nf42_0: nf42_11:mov al, bl nf42_12:mov ah, bl shl eax, 16 nf42_13:mov al, bl nf42_14:mov ah, bl mov [edi], eax nf42_15:mov al, bl nf42_16:mov ah, bl shl eax, 16 nf42_17:mov al, bl nf42_18:mov ah, bl mov [edi+4], eax add edi, edx nf42_31:mov al, bl nf42_32:mov ah, bl shl eax, 16 nf42_33:mov al, bl nf42_34:mov ah, bl mov [edi], eax nf42_35:mov al, bl nf42_36:mov ah, bl shl eax, 16 nf42_37:mov al, bl nf42_38:mov ah, bl mov [edi+4], eax add edi, edx ; Load bl,bh,cl,ch with four colors mov bx, [esi+12] mov cx, [esi+14] nf42_51:mov al, bl nf42_52:mov ah, bl shl eax, 16 nf42_53:mov al, bl nf42_54:mov ah, bl mov [edi], eax nf42_55:mov al, bl nf42_56:mov ah, bl shl eax, 16 nf42_57:mov al, bl nf42_58:mov ah, bl mov [edi+4], eax add edi, edx nf42_71:mov al, bl nf42_72:mov ah, bl shl eax, 16 nf42_73:mov al, bl nf42_74:mov ah, bl mov [edi], eax nf42_75:mov al, bl nf42_76:mov ah, bl shl eax, 16 nf42_77:mov al, bl nf42_78:mov 
ah, bl mov [edi+4], eax add esi, 24 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 nf11: ; 8x8x8 (64 bytes) if 0 ;debug add esi, 64 mov eax, 0fefefefeH ; mov ebx, eax mov ebx, 0 jmp nf_solid endif mov edx, nf_width mov ebx, 0ff00ff00H mov eax, [esi] ;0 mov ecx, eax xor ecx, [esi+8] and ecx, ebx xor eax, ecx mov [edi], eax mov eax, [esi+4] mov ecx, eax xor ecx, [esi+4+8] and ecx, ebx xor eax, ecx mov [edi+4], eax add edi, edx mov eax, [esi+16] ;2 mov ecx, eax xor ecx, [esi+16+8] and ecx, ebx xor eax, ecx mov [edi], eax mov eax, [esi+20] mov ecx, eax xor ecx, [esi+20+8] and ecx, ebx xor eax, ecx mov [edi+4], eax add edi, edx mov eax, [esi+32] ;4 mov ecx, eax xor ecx, [esi+32+8] and ecx, ebx xor eax, ecx mov [edi], eax mov eax, [esi+36] mov ecx, eax xor ecx, [esi+36+8] and ecx, ebx xor eax, ecx mov [edi+4], eax add edi, edx mov eax, [esi+48] ;6 mov ecx, eax xor ecx, [esi+48+8] and ecx, ebx xor eax, ecx mov [edi], eax mov eax, [esi+52] mov ecx, eax xor ecx, [esi+52+8] and ecx, ebx xor eax, ecx mov [edi+4], eax add esi, 64 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf12: ; low 4x4x8 (16 bytes) mov edx, nf_width mov eax, [esi] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx add edi, edx mov eax, [esi+4] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx add edi, edx mov eax, [esi+8] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx add edi, edx mov eax, [esi+12] mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx sub edi, nfpk_back_right add esi, 16 retn ;---------------------------------------- ALIGN 4 nf13: ; 2x2 4x4x0 (4 bytes) mov edx, nf_width mov cl, [esi] mov ch, cl mov eax, ecx shl eax, 16 mov ax, cx mov cl, [esi+1] mov ch, cl mov ebx, ecx shl ebx, 16 mov bx, cx mov [edi], eax mov [edi+4], ebx mov [edi+edx], eax mov [edi+edx+4], ebx lea edi, [edi+edx*2] mov cl, [esi+2] mov ch, cl mov eax, ecx shl eax, 16 mov ax, cx mov cl, [esi+3] mov ch, cl mov ebx, ecx shl ebx, 16 mov bx, cx mov [edi], eax mov [edi+4], ebx add edi, edx mov [edi], eax mov [edi+4], ebx sub edi, nfpk_back_right add esi, 4 retn ;---------------------------------------- ALIGN 4 nf14: ; 8x8x0 (1 byte) mov bl, [esi] ; Copy color into 8 positions inc esi mov bh, bl mov eax, ebx shl eax, 16 mov ax, bx mov ebx, eax if 0 ;debug mov eax, 080808080h mov ebx, eax endif jmp nf_solid retn ;---------------------------------------- ALIGN 4 nf15: ; mix 8x8x0 (2 bytes) mov bx, [esi] ; Copy 2 colors into 8 positions add esi, 2 ; in a checkerboard mov ax, bx shl eax, 16 mov ax, bx mov ebx, eax rol ebx, 8 if 0 ;debug mov eax, 080808080h mov ebx, eax endif nf_solid: mov edx, nf_width mov [edi], eax mov [edi+4], eax add edi, edx mov [edi], ebx mov [edi+4], ebx add edi, edx mov [edi], eax mov [edi+4], eax add edi, edx mov [edi], ebx mov [edi+4], ebx sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn nfPkDecompD ENDP endif ;--- .data ; Constant tables nfhpk_mov4l LABEL DWORD ; low 4x1 in 8x1 (patch +1) ; mov eax, ebx/ecx MOVH4L_REGS TEXTEQU > %FOR m4, MOVH4L_REGS % FOR m3, MOVH4L_REGS % FOR m2, MOVH4L_REGS % FOR m1, MOVH4L_REGS BYTE 
m1,m2,m3,m4 ENDM ENDM ENDM ENDM nfhpk_mov8 LABEL DWORD ; 8x1 (each two bits select a pair of colors in a reg) ; low 4x2 in 8x2 (each two bits select a duplicated color in reg) ; (patch +1) ; mov ds:[edi+0/4/8/12], ebx/edx/ecx/ebp ; Note: Patched code specifies mov [ebp+0]... instead ; of mov [edi+0]... to insure that 8-bit offsets are ; used by the assembler even for offset of zero. ; MOVH8_REGS TEXTEQU > %FOR m4, MOVH8_REGS % FOR m3, MOVH8_REGS % FOR m2, MOVH8_REGS % FOR m1, MOVH8_REGS BYTE m1+047h,m2+047h,m3+047h,m4+047h ENDM ENDM ENDM ENDM nfhpk_mov4 LABEL DWORD ; 4x2 (patch +2) ; mov ax, bx/dx/cx/bp ; low 4x2 in 8x2 (patch +1) ; mov eax, ebx/edx/ecx/ebp MOVH4_REGS TEXTEQU > %FOR m4, MOVH4_REGS % FOR m3, MOVH4_REGS % FOR m2, MOVH4_REGS % FOR m1, MOVH4_REGS BYTE m1,m2,m3,m4 ENDM ENDM ENDM ENDM .code ; Normal version (HiColor) ; if TRANS16 if 0 Trans16 MACRO dst:req, idx:req, mask mov dst, [idx] ifnb and dst, 07FFFh endif ENDM elseif 0 Trans16 MACRO dst:req, idx:req, mask mov dst, [idx] mov ax, dst and ax, 0FFE0h add dst, ax ENDM else EXTERN nf_trans16_lo: WORD EXTERN nf_trans16_hi: WORD Trans16 MACRO dst:req, idx:req, mask xor eax, eax mov al, [idx] mov dst, nf_trans16_lo[eax*2] xor eax, eax mov al, [idx+1] or dst, nf_trans16_hi[eax*2] ENDM endif else Trans16 MACRO dst:req, idx:req, mask mov dst, [idx] ifnb and dst, 07FFFh endif ENDM endif nfHPkDecomp PROC USES ESI EDI EBX, \ ops:PTRBYTE, comp:PTRBYTE, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD LOCAL tbuf: PTRBYTE LOCAL new_row:DWORD LOCAL DiffBufPtrs:DWORD LOCAL nfpk_back_right: DWORD LOCAL wcnt:DWORD LOCAL bcomp:PTRBYTE LOG_LABEL "StartPkDecomp" .data nfhpk_OpTbl label dword dword offset nf0 ; Prev Same (0) dword offset nf1 ; No change (and copied to screen) (0) dword offset nf2 ; Near shift from older part of current buf (1) dword offset nf3 ; Near shift from newer part of current buf (1) dword offset nf4 ; Near shift from previous buffer (1) dword offset nf5 ; Far shift from previous buffer (2) dword offset nf6 ; Far shift from current buffer (2) ; [Or if COMPOPS, run of no changes (0)] dword offset nf7 ; 8x8x1 (10 bytes) or low 4x4x1 (4 bytes) dword offset nf8 ; 2x2 4x4x1 (16 bytes) or 2x1 4x8x1 (12 bytes) or 1x2 8x4x1 (12 bytes) dword offset nf9 ; 8x8x2 (20 bytes) or low 4x4x2 (8 bytes) or ; low 4x8x2 (12 bytes) or low 8x4x2 (12 bytes) dword offset nf10 ; 2x2 4x4x2 (32 bytes) or 2x1 4x8x2 (24 bytes) or 1x2 4x8x2 (24 bytes) dword offset nf11 ; 8x8x8 (64 bytes) dword offset nf12 ; low 4x4x8 (16 bytes) dword offset nf13 ; 2x2 4x4x0 (ie 2x2x8) (4 bytes) dword offset nf14 ; 8x8x0 (1 byte) dword offset nf15 ; mix 8x8x0 (2 bytes) .code ifdef SYMANTEC mov ebx, ds ; Allow DS to access code mov ecx, 0 mov ax, 3505h int 21h endif NF_DECOMP_INIT 1 mov eax, nf_back_right sub eax, SWIDTH*2 mov nfpk_back_right, eax mov esi, comp mov edi, tbuf xor eax, eax mov ax, [esi] add eax, esi mov bcomp, eax add esi, 2 nf_StartRow: mov eax, w shr eax, 1 mov wcnt,eax ALIGN 4 nf_NextPair: dec wcnt js nf_NextRow mov ebx, ops mov al, [ebx] inc ebx mov ops, ebx xor ebx, ebx mov bl, al shr bl, 4 and eax, 0Fh push offset nf_NextPair push nfhpk_OpTbl[ebx*4] jmp nfhpk_OpTbl[eax*4] nf_NextRow: add edi, new_row dec h jnz nf_StartRow LOG_LABEL "EndPkDecomp" ifdef SYMANTEC mov ebx, ds ; Disable DS from accessing code mov ecx, offset DGROUP:_data_bottom[-1] mov ax, 3505h int 21h endif ret ;---------------------------------------- ALIGN 4 nf0: ; No change from previous buffer mov eax, DiffBufPtrs jmp nf_shift ;---------------------------------------- ALIGN 4 nf1: ; No change 
(and copied to screen) if 0 ;debug mov ebx, 0 jmp nf_solid endif add edi, SWIDTH*2 retn ;---------------------------------------- ALIGN 4 nf2: ; Near shift from older part of current buffer xor eax, eax mov ebx, bcomp inc bcomp mov al, [ebx] mov ax, nfpk_ShiftP2[eax*2] nf_xyc_shift: xor ebx, ebx mov bl, ah shl eax, 24 sar eax, 24-1 add eax, nfpk_ShiftY[ebx*4] jmp nf_shift ;---------------------------------------- ALIGN 4 nf3: ; Near shift from newer part of current buffer xor eax, eax mov ebx, bcomp inc bcomp mov al, [ebx] mov ax, nfpk_ShiftP2[eax*2] neg al neg ah jmp nf_xyc_shift ;---------------------------------------- ALIGN 4 nf4: ; Near shift from previous buffer xor eax, eax mov ebx, bcomp inc bcomp mov al, [ebx] mov ax, nfpk_ShiftP1[eax*2] jmp nf_xyp_shift ;---------------------------------------- ALIGN 4 nf5: ; Far shift from previous buffer mov ax, [esi] add esi, 2 nf_xyp_shift: xor ebx, ebx mov bl, ah shl eax, 24 sar eax, 24-1 add eax, nfpk_ShiftY[ebx*4] add eax, DiffBufPtrs jmp nf_shift ;---------------------------------------- ALIGN 4 nf6: ; Far shift from current buffer mov ax, [esi] add esi, 2 jmp nf_xyc_shift ;---------------------------------------- ALIGN 4 nf_shift: if 0 ;debug mov ebx, 0 jmp nf_solid endif mov ebx, esi ; save esi lea esi, [edi+eax] mov edx, nf_width REPEAT 7 mov eax, [esi] mov [edi], eax mov eax, [esi+4] mov [edi+4], eax mov eax, [esi+8] mov [edi+8], eax mov eax, [esi+12] mov [edi+12], eax add esi, edx add edi, edx ENDM mov eax, [esi] mov [edi], eax mov eax, [esi+4] mov [edi+4], eax mov eax, [esi+8] mov [edi+8], eax mov eax, [esi+12] mov [edi+12], eax sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 mov esi, ebx ; restore esi retn ;---------------------------------------- ALIGN 4 nf7: ; 8x8x1 (12 bytes) test word ptr [esi], 08000h jnz nf23 if 0 ;debug add esi, 12 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov8 lea edx, byte ptr ds:nf7_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf7_11-nf7_11)], bl mov [edx+(nf7_12-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_13-nf7_11)], bl mov [edx+(nf7_14-nf7_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf7_21-nf7_11)], bl mov [edx+(nf7_22-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_23-nf7_11)], bl mov [edx+(nf7_24-nf7_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf7_31-nf7_11)], bl mov [edx+(nf7_32-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_33-nf7_11)], bl mov [edx+(nf7_34-nf7_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf7_41-nf7_11)], bl mov [edx+(nf7_42-nf7_11)], bh shr ebx, 16 mov [edx+(nf7_43-nf7_11)], bl mov [edx+(nf7_44-nf7_11)], bh lea edx, [edx+(nf7_51-nf7_11)] mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf7_51-nf7_51)], bl mov [edx+(nf7_52-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_53-nf7_51)], bl mov [edx+(nf7_54-nf7_51)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf7_61-nf7_51)], bl mov [edx+(nf7_62-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_63-nf7_51)], bl mov [edx+(nf7_64-nf7_51)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf7_71-nf7_51)], bl mov [edx+(nf7_72-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_73-nf7_51)], bl mov [edx+(nf7_74-nf7_51)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf7_81-nf7_51)], bl mov [edx+(nf7_82-nf7_51)], bh shr ebx, 16 mov [edx+(nf7_83-nf7_51)], bl mov [edx+(nf7_84-nf7_51)], bh push ebp push esi ; load ebx,edx,ecx,ebp with 00,01,10,11 color combinations ; (note that bits are read least significant first). 
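;
; Self-modifying detail: each bitmap byte above indexes nfhpk_mov8; the
; four packed bytes of an entry are ModRM values written over the nf7_xx
; store slots, so each "mov [ebp+disp], reg" template ends up storing the
; register that holds the right two-pixel combination, with EDI as the
; actual base at run time (the EBP template merely forces an 8-bit
; displacement even for offset zero, keeping every slot the same length).
;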
if TRANS16 Trans16 cx, esi+2 shl ecx, 16 Trans16 cx, esi else mov ecx, [esi] endif mov esi,nf_width mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx jmp nf7_0 ; flush prefetch ALIGN 4 nf7_0: nf7_11: mov [ebp+0], ebx nf7_12: mov [ebp+4], ebx nf7_13: mov [ebp+8], ebx nf7_14: mov [ebp+12], ebx add edi, esi nf7_21: mov [ebp+0], ebx nf7_22: mov [ebp+4], ebx nf7_23: mov [ebp+8], ebx nf7_24: mov [ebp+12], ebx add edi, esi nf7_31: mov [ebp+0], ebx nf7_32: mov [ebp+4], ebx nf7_33: mov [ebp+8], ebx nf7_34: mov [ebp+12], ebx add edi, esi nf7_41: mov [ebp+0], ebx nf7_42: mov [ebp+4], ebx nf7_43: mov [ebp+8], ebx nf7_44: mov [ebp+12], ebx add edi, esi nf7_51: mov [ebp+0], ebx nf7_52: mov [ebp+4], ebx nf7_53: mov [ebp+8], ebx nf7_54: mov [ebp+12], ebx add edi, esi nf7_61: mov [ebp+0], ebx nf7_62: mov [ebp+4], ebx nf7_63: mov [ebp+8], ebx nf7_64: mov [ebp+12], ebx add edi, esi nf7_71: mov [ebp+0], ebx nf7_72: mov [ebp+4], ebx nf7_73: mov [ebp+8], ebx nf7_74: mov [ebp+12], ebx add edi, esi nf7_81: mov [ebp+0], ebx nf7_82: mov [ebp+4], ebx nf7_83: mov [ebp+8], ebx nf7_84: mov [ebp+12], ebx pop esi pop ebp add esi, 12 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf7+16 nf23: ; low 4x4x1 (6 bytes) if 0 ;debug add esi, 6 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov4l lea edx, byte ptr ds:nf23_11+1 mov al, [esi+4] and al, 0fH mov ebx, [ecx+eax*4] mov [edx+(nf23_11-nf23_11)], bl mov [edx+(nf23_12-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_13-nf23_11)], bl mov [edx+(nf23_14-nf23_11)], bh mov al, [esi+4] shr al, 4 mov ebx, [ecx+eax*4] mov [edx+(nf23_31-nf23_11)], bl mov [edx+(nf23_32-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_33-nf23_11)], bl mov [edx+(nf23_34-nf23_11)], bh mov al, [esi+5] and al, 0fH mov ebx, [ecx+eax*4] mov [edx+(nf23_51-nf23_11)], bl mov [edx+(nf23_52-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_53-nf23_11)], bl mov [edx+(nf23_54-nf23_11)], bh mov al, [esi+5] shr al, 4 mov ebx, [ecx+eax*4] mov [edx+(nf23_71-nf23_11)], bl mov [edx+(nf23_72-nf23_11)], bh shr ebx, 16 mov [edx+(nf23_73-nf23_11)], bl mov [edx+(nf23_74-nf23_11)], bh mov edx, nf_width ; load ebx,ecx with 00,11 color combinations if TRANS16 Trans16 cx, esi, 1 shrd ebx, ecx, 16 mov bx, cx Trans16 cx, esi+2 shrd eax, ecx, 16 mov ax, cx mov ecx, eax else mov ebx, [esi] and ebx, 07FFF7FFFh mov ecx, ebx ror ebx, 16 xchg bx,cx endif jmp nf23_0 ; flush prefetch ALIGN 4 nf23_0: nf23_11:mov eax, ebx mov [edi], eax mov [edi+edx], eax nf23_12:mov eax, ebx mov [edi+4], eax mov [edi+edx+4], eax nf23_13:mov eax, ebx mov [edi+8], eax mov [edi+edx+8], eax nf23_14:mov eax, ebx mov [edi+12], eax mov [edi+edx+12], eax lea edi, [edi+edx*2] nf23_31:mov eax, ebx mov [edi], eax mov [edi+edx], eax nf23_32:mov eax, ebx mov [edi+4], eax mov [edi+edx+4], eax nf23_33:mov eax, ebx mov [edi+8], eax mov [edi+edx+8], eax nf23_34:mov eax, ebx mov [edi+12], eax mov [edi+edx+12], eax lea edi, [edi+edx*2] nf23_51:mov eax, ebx mov [edi], eax mov [edi+edx], eax nf23_52:mov eax, ebx mov [edi+4], eax mov [edi+edx+4], eax nf23_53:mov eax, ebx mov [edi+8], eax mov [edi+edx+8], eax nf23_54:mov eax, ebx mov [edi+12], eax mov [edi+edx+12], eax lea edi, [edi+edx*2] nf23_71:mov eax, ebx mov [edi], eax mov [edi+edx], eax nf23_72:mov eax, ebx mov [edi+4], eax mov [edi+edx+4], eax nf23_73:mov eax, ebx mov [edi+8], eax mov [edi+edx+8], eax nf23_74:mov eax, ebx mov [edi+12], eax mov [edi+edx+12], eax add edi, edx sub edi, nfpk_back_right add esi, 6 retn ;---------------------------------------- 
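;
; For reference, the Trans16 macro used above reduces to two table
; lookups when TRANS16 is enabled.  Illustrative C equivalent (not part
; of the build; mirrors the nf_trans16_lo/nf_trans16_hi EXTERNs, table
; sizes assumed):
;
;   extern unsigned short nf_trans16_lo[256], nf_trans16_hi[256];
;
;   static unsigned short trans16(const unsigned char *p)
;   {
;       return (unsigned short)(nf_trans16_lo[p[0]] | nf_trans16_hi[p[1]]);
;   }
;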
ALIGN 4 nf8: ; 2x2 4x4x1 (24 bytes) test word ptr [esi], 08000h jnz nf24 if 0 ;debug add esi, 24 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov8 lea edx, byte ptr ds:nf8_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf8_11-nf8_11)], bl mov [edx+(nf8_12-nf8_11)], bh shr ebx, 16 mov [edx+(nf8_13-nf8_11)], bl mov [edx+(nf8_14-nf8_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf8_21-nf8_11)], bl mov [edx+(nf8_22-nf8_11)], bh shr ebx, 16 mov [edx+(nf8_23-nf8_11)], bl mov [edx+(nf8_24-nf8_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf8_31-nf8_11)], bl mov [edx+(nf8_32-nf8_11)], bh shr ebx, 16 mov [edx+(nf8_33-nf8_11)], bl mov [edx+(nf8_34-nf8_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf8_41-nf8_11)], bl mov [edx+(nf8_42-nf8_11)], bh shr ebx, 16 mov [edx+(nf8_43-nf8_11)], bl mov [edx+(nf8_44-nf8_11)], bh add edx, nf8_51-nf8_11 mov al, [esi+16] mov ebx, [ecx+eax*4] mov [edx+(nf8_51-nf8_51)], bl mov [edx+(nf8_52-nf8_51)], bh shr ebx, 16 mov [edx+(nf8_53-nf8_51)], bl mov [edx+(nf8_54-nf8_51)], bh mov al, [esi+17] mov ebx, [ecx+eax*4] mov [edx+(nf8_61-nf8_51)], bl mov [edx+(nf8_62-nf8_51)], bh shr ebx, 16 mov [edx+(nf8_63-nf8_51)], bl mov [edx+(nf8_64-nf8_51)], bh mov al, [esi+22] mov ebx, [ecx+eax*4] mov [edx+(nf8_71-nf8_51)], bl mov [edx+(nf8_72-nf8_51)], bh shr ebx, 16 mov [edx+(nf8_73-nf8_51)], bl mov [edx+(nf8_74-nf8_51)], bh mov al, [esi+23] mov ebx, [ecx+eax*4] mov [edx+(nf8_81-nf8_51)], bl mov [edx+(nf8_82-nf8_51)], bh shr ebx, 16 mov [edx+(nf8_83-nf8_51)], bl mov [edx+(nf8_84-nf8_51)], bh push ebp push esi ; load ebx,edx,ecx,ebp with 00,01,10,11 color combinations ; (note that bits are read least significant first). if TRANS16 Trans16 cx, esi+18+2 shl ecx, 16 Trans16 cx, esi+18 push ecx Trans16 cx, esi+12+2 shl ecx, 16 Trans16 cx, esi+12 push ecx Trans16 cx, esi+6+2 shl ecx, 16 Trans16 cx, esi+6 push ecx Trans16 cx, esi+2 shl ecx, 16 Trans16 cx, esi else mov ecx, [esi] endif mov esi,nf_width mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx jmp nf8_0 ; flush prefetch ALIGN 4 nf8_0: nf8_11: mov [ebp+0], ebx nf8_12: mov [ebp+4], ebx add edi, esi nf8_13: mov [ebp+0], ebx nf8_14: mov [ebp+4], ebx add edi, esi nf8_21: mov [ebp+0], ebx nf8_22: mov [ebp+4], ebx add edi, esi nf8_23: mov [ebp+0], ebx nf8_24: mov [ebp+4], ebx add edi, esi if TRANS16 pop ecx else mov eax, [esp] mov ecx, [eax+6] endif mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx nf8_31: mov [ebp+0], ebx nf8_32: mov [ebp+4], ebx add edi, esi nf8_33: mov [ebp+0], ebx nf8_34: mov [ebp+4], ebx add edi, esi nf8_41: mov [ebp+0], ebx nf8_42: mov [ebp+4], ebx add edi, esi nf8_43: mov [ebp+0], ebx nf8_44: mov [ebp+4], ebx add edi, esi lea eax, [esi*8-8] sub edi, eax if TRANS16 pop ecx else mov eax, [esp] mov ecx, [eax+12] endif mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx nf8_51: mov [ebp+0], ebx nf8_52: mov [ebp+4], ebx add edi, esi nf8_53: mov [ebp+0], ebx nf8_54: mov [ebp+4], ebx add edi, esi nf8_61: mov [ebp+0], ebx nf8_62: mov [ebp+4], ebx add edi, esi nf8_63: mov [ebp+0], ebx nf8_64: mov [ebp+4], ebx add edi, esi if TRANS16 pop ecx else mov eax, [esp] mov ecx, [eax+18] endif mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx nf8_71: mov [ebp+0], ebx nf8_72: mov [ebp+4], ebx add edi, esi nf8_73: mov [ebp+0], ebx nf8_74: mov [ebp+4], ebx add edi, esi nf8_81: mov [ebp+0], ebx nf8_82: mov [ebp+4], ebx add edi, esi nf8_83: mov [ebp+0], ebx nf8_84: mov [ebp+4], ebx pop esi pop ebp add 
esi, 24 sub edi, 8 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf8+16 nf24: ; 2x1 4x8x1 (16 bytes) test word ptr [esi+8], 08000h jnz nf40 if 0 ;debug add esi, 16 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov8 lea edx, byte ptr ds:nf24_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf24_11-nf24_11)], bl mov [edx+(nf24_12-nf24_11)], bh shr ebx, 16 mov [edx+(nf24_13-nf24_11)], bl mov [edx+(nf24_14-nf24_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf24_21-nf24_11)], bl mov [edx+(nf24_22-nf24_11)], bh shr ebx, 16 mov [edx+(nf24_23-nf24_11)], bl mov [edx+(nf24_24-nf24_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf24_31-nf24_11)], bl mov [edx+(nf24_32-nf24_11)], bh shr ebx, 16 mov [edx+(nf24_33-nf24_11)], bl mov [edx+(nf24_34-nf24_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf24_41-nf24_11)], bl mov [edx+(nf24_42-nf24_11)], bh shr ebx, 16 mov [edx+(nf24_43-nf24_11)], bl mov [edx+(nf24_44-nf24_11)], bh add edx, nf24_51-nf24_11 mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf24_51-nf24_51)], bl mov [edx+(nf24_52-nf24_51)], bh shr ebx, 16 mov [edx+(nf24_53-nf24_51)], bl mov [edx+(nf24_54-nf24_51)], bh mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf24_61-nf24_51)], bl mov [edx+(nf24_62-nf24_51)], bh shr ebx, 16 mov [edx+(nf24_63-nf24_51)], bl mov [edx+(nf24_64-nf24_51)], bh mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf24_71-nf24_51)], bl mov [edx+(nf24_72-nf24_51)], bh shr ebx, 16 mov [edx+(nf24_73-nf24_51)], bl mov [edx+(nf24_74-nf24_51)], bh mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf24_81-nf24_51)], bl mov [edx+(nf24_82-nf24_51)], bh shr ebx, 16 mov [edx+(nf24_83-nf24_51)], bl mov [edx+(nf24_84-nf24_51)], bh push ebp push esi ; load ebx,edx,ecx,ebp with 00,01,10,11 color combinations ; (note that bits are read least significant first). 
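;
; As in nf8 above, ESI is about to be repurposed as the row pitch, so the
; translated color words needed for the second half of the block are
; pushed below and popped when needed; with TRANS16 disabled they are
; instead re-read through the original source pointer saved on the stack.
;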
if TRANS16 Trans16 cx, esi+8+2 shl ecx, 16 Trans16 cx, esi+8 push ecx Trans16 cx, esi+2 shl ecx, 16 Trans16 cx, esi, 1 else mov ecx, [esi] and ecx, 07FFF7FFFh endif mov esi,nf_width mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx jmp nf24_0 ; flush prefetch ALIGN 4 nf24_0: nf24_11:mov [ebp+0], ebx nf24_12:mov [ebp+4], ebx add edi, esi nf24_13:mov [ebp+0], ebx nf24_14:mov [ebp+4], ebx add edi, esi nf24_21:mov [ebp+0], ebx nf24_22:mov [ebp+4], ebx add edi, esi nf24_23:mov [ebp+0], ebx nf24_24:mov [ebp+4], ebx add edi, esi nf24_31:mov [ebp+0], ebx nf24_32:mov [ebp+4], ebx add edi, esi nf24_33:mov [ebp+0], ebx nf24_34:mov [ebp+4], ebx add edi, esi nf24_41:mov [ebp+0], ebx nf24_42:mov [ebp+4], ebx add edi, esi nf24_43:mov [ebp+0], ebx nf24_44:mov [ebp+4], ebx add edi, esi lea eax, [esi*8-8] sub edi, eax if TRANS16 pop ecx else mov eax, [esp] mov ecx, [eax+8] endif mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx nf24_51:mov [ebp+0], ebx nf24_52:mov [ebp+4], ebx add edi, esi nf24_53:mov [ebp+0], ebx nf24_54:mov [ebp+4], ebx add edi, esi nf24_61:mov [ebp+0], ebx nf24_62:mov [ebp+4], ebx add edi, esi nf24_63:mov [ebp+0], ebx nf24_64:mov [ebp+4], ebx add edi, esi nf24_71:mov [ebp+0], ebx nf24_72:mov [ebp+4], ebx add edi, esi nf24_73:mov [ebp+0], ebx nf24_74:mov [ebp+4], ebx add edi, esi nf24_81:mov [ebp+0], ebx nf24_82:mov [ebp+4], ebx add edi, esi nf24_83:mov [ebp+0], ebx nf24_84:mov [ebp+4], ebx pop esi pop ebp add esi, 16 sub edi, 8 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf8+32 nf40: ; 1x2 8x4x1 (16 bytes) if 0 ;debug add esi, 16 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov8 lea edx, byte ptr ds:nf40_11+1 mov al, [esi+4] mov ebx, [ecx+eax*4] mov [edx+(nf40_11-nf40_11)], bl mov [edx+(nf40_12-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_13-nf40_11)], bl mov [edx+(nf40_14-nf40_11)], bh mov al, [esi+5] mov ebx, [ecx+eax*4] mov [edx+(nf40_21-nf40_11)], bl mov [edx+(nf40_22-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_23-nf40_11)], bl mov [edx+(nf40_24-nf40_11)], bh mov al, [esi+6] mov ebx, [ecx+eax*4] mov [edx+(nf40_31-nf40_11)], bl mov [edx+(nf40_32-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_33-nf40_11)], bl mov [edx+(nf40_34-nf40_11)], bh mov al, [esi+7] mov ebx, [ecx+eax*4] mov [edx+(nf40_41-nf40_11)], bl mov [edx+(nf40_42-nf40_11)], bh shr ebx, 16 mov [edx+(nf40_43-nf40_11)], bl mov [edx+(nf40_44-nf40_11)], bh add edx, nf40_51-nf40_11 mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf40_51-nf40_51)], bl mov [edx+(nf40_52-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_53-nf40_51)], bl mov [edx+(nf40_54-nf40_51)], bh mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf40_61-nf40_51)], bl mov [edx+(nf40_62-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_63-nf40_51)], bl mov [edx+(nf40_64-nf40_51)], bh mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf40_71-nf40_51)], bl mov [edx+(nf40_72-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_73-nf40_51)], bl mov [edx+(nf40_74-nf40_51)], bh mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf40_81-nf40_51)], bl mov [edx+(nf40_82-nf40_51)], bh shr ebx, 16 mov [edx+(nf40_83-nf40_51)], bl mov [edx+(nf40_84-nf40_51)], bh push ebp push esi ; load ebx,edx,ecx,ebp with 00,01,10,11 color combinations ; (note that bits are read least significant first). 
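;
; The optional third Trans16 argument (the ", 1" below) only matters in
; the non-table fallback, where it masks off bit 15 of the raw color
; word; these HiColor opcodes reuse that bit as the layout-selector flag
; (the "test word ptr [esi], 08000h" checks above), so it is stripped
; before the color is used.  The table-driven variant ignores the
; argument.
;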
if TRANS16 Trans16 cx, esi+8+2 shl ecx, 16 Trans16 cx, esi+8, 1 push ecx Trans16 cx, esi+2 shl ecx, 16 Trans16 cx, esi, 1 else mov ecx, [esi] and ecx, 07FFF7FFFh endif mov esi,nf_width mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx jmp nf40_0 ; flush prefetch ALIGN 4 nf40_0: nf40_11:mov [ebp+0], ebx nf40_12:mov [ebp+4], ebx nf40_13:mov [ebp+8], ebx nf40_14:mov [ebp+12], ebx add edi, esi nf40_21:mov [ebp+0], ebx nf40_22:mov [ebp+4], ebx nf40_23:mov [ebp+8], ebx nf40_24:mov [ebp+12], ebx add edi, esi nf40_31:mov [ebp+0], ebx nf40_32:mov [ebp+4], ebx nf40_33:mov [ebp+8], ebx nf40_34:mov [ebp+12], ebx add edi, esi nf40_41:mov [ebp+0], ebx nf40_42:mov [ebp+4], ebx nf40_43:mov [ebp+8], ebx nf40_44:mov [ebp+12], ebx add edi, esi if TRANS16 pop ecx else mov eax, [esp] mov ecx, [eax+8] and ecx, 07FFF7FFFh endif mov edx, ecx ror edx, 16 mov ebx, edx mov bx, cx mov ebp, ecx mov bp, dx nf40_51:mov [ebp+0], ebx nf40_52:mov [ebp+4], ebx nf40_53:mov [ebp+8], ebx nf40_54:mov [ebp+12], ebx add edi, esi nf40_61:mov [ebp+0], ebx nf40_62:mov [ebp+4], ebx nf40_63:mov [ebp+8], ebx nf40_64:mov [ebp+12], ebx add edi, esi nf40_71:mov [ebp+0], ebx nf40_72:mov [ebp+4], ebx nf40_73:mov [ebp+8], ebx nf40_74:mov [ebp+12], ebx add edi, esi nf40_81:mov [ebp+0], ebx nf40_82:mov [ebp+4], ebx nf40_83:mov [ebp+8], ebx nf40_84:mov [ebp+12], ebx pop esi pop ebp add esi, 16 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf9: ; 8x8x2 (24 bytes) test word ptr [esi], 08000h jnz nf41 test word ptr [esi+4], 08000h jnz nf25 if 0 ;debug add esi, 24 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov4 lea edx, byte ptr ds:nf9_11+2 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf9_11-nf9_11)], bh mov [edx+(nf9_12-nf9_11)], bl shr ebx, 16 mov [edx+(nf9_13-nf9_11)], bh mov [edx+(nf9_14-nf9_11)], bl mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf9_15-nf9_11)], bh mov [edx+(nf9_16-nf9_11)], bl shr ebx, 16 mov [edx+(nf9_17-nf9_11)], bh mov [edx+(nf9_18-nf9_11)], bl mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf9_21-nf9_11)], bh mov [edx+(nf9_22-nf9_11)], bl shr ebx, 16 mov [edx+(nf9_23-nf9_11)], bh mov [edx+(nf9_24-nf9_11)], bl mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf9_25-nf9_11)], bh mov [edx+(nf9_26-nf9_11)], bl shr ebx, 16 mov [edx+(nf9_27-nf9_11)], bh mov [edx+(nf9_28-nf9_11)], bl mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf9_31-nf9_11)], bh mov [edx+(nf9_32-nf9_11)], bl shr ebx, 16 mov [edx+(nf9_33-nf9_11)], bh mov [edx+(nf9_34-nf9_11)], bl mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf9_35-nf9_11)], bh mov [edx+(nf9_36-nf9_11)], bl shr ebx, 16 mov [edx+(nf9_37-nf9_11)], bh mov [edx+(nf9_38-nf9_11)], bl mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf9_41-nf9_11)], bh mov [edx+(nf9_42-nf9_11)], bl shr ebx, 16 mov [edx+(nf9_43-nf9_11)], bh mov [edx+(nf9_44-nf9_11)], bl mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf9_45-nf9_11)], bh mov [edx+(nf9_46-nf9_11)], bl shr ebx, 16 mov [edx+(nf9_47-nf9_11)], bh mov [edx+(nf9_48-nf9_11)], bl lea edx, [edx+(nf9_51-nf9_11)] mov al, [esi+16] mov ebx, [ecx+eax*4] mov [edx+(nf9_51-nf9_51)], bh mov [edx+(nf9_52-nf9_51)], bl shr ebx, 16 mov [edx+(nf9_53-nf9_51)], bh mov [edx+(nf9_54-nf9_51)], bl mov al, [esi+17] mov ebx, [ecx+eax*4] mov [edx+(nf9_55-nf9_51)], bh mov [edx+(nf9_56-nf9_51)], bl shr ebx, 16 mov [edx+(nf9_57-nf9_51)], bh mov [edx+(nf9_58-nf9_51)], bl mov al, [esi+18] mov ebx, [ecx+eax*4] mov [edx+(nf9_61-nf9_51)], bh mov [edx+(nf9_62-nf9_51)], bl shr ebx, 16 mov 
[edx+(nf9_63-nf9_51)], bh mov [edx+(nf9_64-nf9_51)], bl mov al, [esi+19] mov ebx, [ecx+eax*4] mov [edx+(nf9_65-nf9_51)], bh mov [edx+(nf9_66-nf9_51)], bl shr ebx, 16 mov [edx+(nf9_67-nf9_51)], bh mov [edx+(nf9_68-nf9_51)], bl mov al, [esi+20] mov ebx, [ecx+eax*4] mov [edx+(nf9_71-nf9_51)], bh mov [edx+(nf9_72-nf9_51)], bl shr ebx, 16 mov [edx+(nf9_73-nf9_51)], bh mov [edx+(nf9_74-nf9_51)], bl mov al, [esi+21] mov ebx, [ecx+eax*4] mov [edx+(nf9_75-nf9_51)], bh mov [edx+(nf9_76-nf9_51)], bl shr ebx, 16 mov [edx+(nf9_77-nf9_51)], bh mov [edx+(nf9_78-nf9_51)], bl mov al, [esi+22] mov ebx, [ecx+eax*4] mov [edx+(nf9_81-nf9_51)], bh mov [edx+(nf9_82-nf9_51)], bl shr ebx, 16 mov [edx+(nf9_83-nf9_51)], bh mov [edx+(nf9_84-nf9_51)], bl mov al, [esi+23] mov ebx, [ecx+eax*4] mov [edx+(nf9_85-nf9_51)], bh mov [edx+(nf9_86-nf9_51)], bl shr ebx, 16 mov [edx+(nf9_87-nf9_51)], bh mov [edx+(nf9_88-nf9_51)], bl push ebp push esi ; Load bx,dx,cx,bp with four colors if TRANS16 Trans16 bx, esi Trans16 dx, esi+2 Trans16 cx, esi+4 Trans16 bp, esi+6 else mov bx, [esi] mov dx, [esi+2] mov cx, [esi+4] mov bp, [esi+6] endif mov esi, nf_width jmp nf9_0 ; flush prefetch ALIGN 4 nf9_0: nf9_11: mov ax, bx shl eax, 16 nf9_12: mov ax, bx mov [edi], eax nf9_13: mov ax, bx shl eax, 16 nf9_14: mov ax, bx mov [edi+4], eax nf9_15: mov ax, bx shl eax, 16 nf9_16: mov ax, bx mov [edi+8], eax nf9_17: mov ax, bx shl eax, 16 nf9_18: mov ax, bx mov [edi+12], eax add edi, esi nf9_21: mov ax, bx shl eax, 16 nf9_22: mov ax, bx mov [edi], eax nf9_23: mov ax, bx shl eax, 16 nf9_24: mov ax, bx mov [edi+4], eax nf9_25: mov ax, bx shl eax, 16 nf9_26: mov ax, bx mov [edi+8], eax nf9_27: mov ax, bx shl eax, 16 nf9_28: mov ax, bx mov [edi+12], eax add edi, esi nf9_31: mov ax, bx shl eax, 16 nf9_32: mov ax, bx mov [edi], eax nf9_33: mov ax, bx shl eax, 16 nf9_34: mov ax, bx mov [edi+4], eax nf9_35: mov ax, bx shl eax, 16 nf9_36: mov ax, bx mov [edi+8], eax nf9_37: mov ax, bx shl eax, 16 nf9_38: mov ax, bx mov [edi+12], eax add edi, esi nf9_41: mov ax, bx shl eax, 16 nf9_42: mov ax, bx mov [edi], eax nf9_43: mov ax, bx shl eax, 16 nf9_44: mov ax, bx mov [edi+4], eax nf9_45: mov ax, bx shl eax, 16 nf9_46: mov ax, bx mov [edi+8], eax nf9_47: mov ax, bx shl eax, 16 nf9_48: mov ax, bx mov [edi+12], eax add edi, esi nf9_51: mov ax, bx shl eax, 16 nf9_52: mov ax, bx mov [edi], eax nf9_53: mov ax, bx shl eax, 16 nf9_54: mov ax, bx mov [edi+4], eax nf9_55: mov ax, bx shl eax, 16 nf9_56: mov ax, bx mov [edi+8], eax nf9_57: mov ax, bx shl eax, 16 nf9_58: mov ax, bx mov [edi+12], eax add edi, esi nf9_61: mov ax, bx shl eax, 16 nf9_62: mov ax, bx mov [edi], eax nf9_63: mov ax, bx shl eax, 16 nf9_64: mov ax, bx mov [edi+4], eax nf9_65: mov ax, bx shl eax, 16 nf9_66: mov ax, bx mov [edi+8], eax nf9_67: mov ax, bx shl eax, 16 nf9_68: mov ax, bx mov [edi+12], eax add edi, esi nf9_71: mov ax, bx shl eax, 16 nf9_72: mov ax, bx mov [edi], eax nf9_73: mov ax, bx shl eax, 16 nf9_74: mov ax, bx mov [edi+4], eax nf9_75: mov ax, bx shl eax, 16 nf9_76: mov ax, bx mov [edi+8], eax nf9_77: mov ax, bx shl eax, 16 nf9_78: mov ax, bx mov [edi+12], eax add edi, esi nf9_81: mov ax, bx shl eax, 16 nf9_82: mov ax, bx mov [edi], eax nf9_83: mov ax, bx shl eax, 16 nf9_84: mov ax, bx mov [edi+4], eax nf9_85: mov ax, bx shl eax, 16 nf9_86: mov ax, bx mov [edi+8], eax nf9_87: mov ax, bx shl eax, 16 nf9_88: mov ax, bx mov [edi+12], eax pop esi pop ebp add esi, 24 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 ;nf9+16 nf25: ; low 4x4x2 (12 bytes) if 0 
;debug add esi, 12 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov4 lea edx, byte ptr ds:nf25_11+1 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf25_11-nf25_11)], bl mov [edx+(nf25_12-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_13-nf25_11)], bl mov [edx+(nf25_14-nf25_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf25_21-nf25_11)], bl mov [edx+(nf25_22-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_23-nf25_11)], bl mov [edx+(nf25_24-nf25_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf25_31-nf25_11)], bl mov [edx+(nf25_32-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_33-nf25_11)], bl mov [edx+(nf25_34-nf25_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf25_41-nf25_11)], bl mov [edx+(nf25_42-nf25_11)], bh shr ebx, 16 mov [edx+(nf25_43-nf25_11)], bl mov [edx+(nf25_44-nf25_11)], bh push ebp push esi ; Load ebx,edx,ecx,ebp with four colors, duplicated in high order. if TRANS16 Trans16 cx, esi shrd ebx, ecx, 16 mov bx, cx Trans16 cx, esi+2 shrd edx, ecx, 16 mov dx, cx Trans16 cx, esi+4, 1 shrd eax, ecx, 16 mov ax, cx push eax Trans16 cx, esi+6 shrd ebp, ecx, 16 mov bp, cx pop ecx else mov ax, [esi] shrd ebx, eax, 16 mov bx, ax mov ax, [esi+2] shrd edx, eax, 16 mov dx, ax mov ax, [esi+4] and eax, 07fffh shrd ecx, eax, 16 mov cx, ax mov ax, [esi+6] shrd ebp, eax, 16 mov bp, ax endif mov esi, nf_width jmp nf25_0 ; flush prefetch ALIGN 4 nf25_0: nf25_11:mov eax, ebx mov [edi], eax mov [edi+esi], eax nf25_12:mov eax, ebx mov [edi+4], eax mov [edi+esi+4], eax nf25_13:mov eax, ebx mov [edi+8], eax mov [edi+esi+8], eax nf25_14:mov eax, ebx mov [edi+12], eax mov [edi+esi+12], eax lea edi, [edi+esi*2] nf25_21:mov eax, ebx mov [edi], eax mov [edi+esi], eax nf25_22:mov eax, ebx mov [edi+4], eax mov [edi+esi+4], eax nf25_23:mov eax, ebx mov [edi+8], eax mov [edi+esi+8], eax nf25_24:mov eax, ebx mov [edi+12], eax mov [edi+esi+12], eax lea edi, [edi+esi*2] nf25_31:mov eax, ebx mov [edi], eax mov [edi+esi], eax nf25_32:mov eax, ebx mov [edi+4], eax mov [edi+esi+4], eax nf25_33:mov eax, ebx mov [edi+8], eax mov [edi+esi+8], eax nf25_34:mov eax, ebx mov [edi+12], eax mov [edi+esi+12], eax lea edi, [edi+esi*2] nf25_41:mov eax, ebx mov [edi], eax mov [edi+esi], eax nf25_42:mov eax, ebx mov [edi+4], eax mov [edi+esi+4], eax nf25_43:mov eax, ebx mov [edi+8], eax mov [edi+esi+8], eax nf25_44:mov eax, ebx mov [edi+12], eax mov [edi+esi+12], eax add edi, esi pop esi pop ebp add esi, 12 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf9+32 nf41: ; low 4x8x2 (16 bytes) test word ptr [esi+4], 08000h jnz nf57 if 0 ;debug add esi, 16 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov8 lea edx, byte ptr ds:nf41_11+1 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf41_11-nf41_11)], bl mov [edx+(nf41_12-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_13-nf41_11)], bl mov [edx+(nf41_14-nf41_11)], bh mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf41_21-nf41_11)], bl mov [edx+(nf41_22-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_23-nf41_11)], bl mov [edx+(nf41_24-nf41_11)], bh mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf41_31-nf41_11)], bl mov [edx+(nf41_32-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_33-nf41_11)], bl mov [edx+(nf41_34-nf41_11)], bh mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf41_41-nf41_11)], bl mov [edx+(nf41_42-nf41_11)], bh shr ebx, 16 mov [edx+(nf41_43-nf41_11)], bl mov [edx+(nf41_44-nf41_11)], bh lea edx, [edx+(nf41_51-nf41_11)] mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf41_51-nf41_51)], bl mov 
[edx+(nf41_52-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_53-nf41_51)], bl mov [edx+(nf41_54-nf41_51)], bh mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf41_61-nf41_51)], bl mov [edx+(nf41_62-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_63-nf41_51)], bl mov [edx+(nf41_64-nf41_51)], bh mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf41_71-nf41_51)], bl mov [edx+(nf41_72-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_73-nf41_51)], bl mov [edx+(nf41_74-nf41_51)], bh mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf41_81-nf41_51)], bl mov [edx+(nf41_82-nf41_51)], bh shr ebx, 16 mov [edx+(nf41_83-nf41_51)], bl mov [edx+(nf41_84-nf41_51)], bh push ebp push esi ; Load ebx,edx,ecx,ebp with four colors, duplicated in high order. if TRANS16 Trans16 cx, esi, 1 shrd ebx, ecx, 16 mov bx, cx Trans16 cx, esi+2 shrd edx, ecx, 16 mov dx, cx Trans16 cx, esi+4 shrd eax, ecx, 16 mov ax, cx push eax Trans16 cx, esi+6 shrd ebp, ecx, 16 mov bp, cx pop ecx else mov ax, [esi] and eax, 07fffh shrd ebx, eax, 16 mov bx, ax mov ax, [esi+2] shrd edx, eax, 16 mov dx, ax mov ax, [esi+4] shrd ecx, eax, 16 mov cx, ax mov ax, [esi+6] shrd ebp, eax, 16 mov bp, ax endif mov esi, nf_width jmp nf41_0 ; flush prefetch ALIGN 4 nf41_0: nf41_11:mov [ebp+0], ebx nf41_12:mov [ebp+4], ebx nf41_13:mov [ebp+8], ebx nf41_14:mov [ebp+12], ebx add edi, esi nf41_21:mov [ebp+0], ebx nf41_22:mov [ebp+4], ebx nf41_23:mov [ebp+8], ebx nf41_24:mov [ebp+12], ebx add edi, esi nf41_31:mov [ebp+0], ebx nf41_32:mov [ebp+4], ebx nf41_33:mov [ebp+8], ebx nf41_34:mov [ebp+12], ebx add edi, esi nf41_41:mov [ebp+0], ebx nf41_42:mov [ebp+4], ebx nf41_43:mov [ebp+8], ebx nf41_44:mov [ebp+12], ebx add edi, esi nf41_51:mov [ebp+0], ebx nf41_52:mov [ebp+4], ebx nf41_53:mov [ebp+8], ebx nf41_54:mov [ebp+12], ebx add edi, esi nf41_61:mov [ebp+0], ebx nf41_62:mov [ebp+4], ebx nf41_63:mov [ebp+8], ebx nf41_64:mov [ebp+12], ebx add edi, esi nf41_71:mov [ebp+0], ebx nf41_72:mov [ebp+4], ebx nf41_73:mov [ebp+8], ebx nf41_74:mov [ebp+12], ebx add edi, esi nf41_81:mov [ebp+0], ebx nf41_82:mov [ebp+4], ebx nf41_83:mov [ebp+8], ebx nf41_84:mov [ebp+12], ebx pop esi pop ebp add esi, 16 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf9+48 nf57: ; low 8x4x2 (16 bytes) if 0 ;debug add esi, 16 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov4 lea edx, byte ptr ds:nf57_11+2 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf57_11-nf57_11)], bh mov [edx+(nf57_12-nf57_11)], bl shr ebx, 16 mov [edx+(nf57_13-nf57_11)], bh mov [edx+(nf57_14-nf57_11)], bl mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf57_15-nf57_11)], bh mov [edx+(nf57_16-nf57_11)], bl shr ebx, 16 mov [edx+(nf57_17-nf57_11)], bh mov [edx+(nf57_18-nf57_11)], bl mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf57_21-nf57_11)], bh mov [edx+(nf57_22-nf57_11)], bl shr ebx, 16 mov [edx+(nf57_23-nf57_11)], bh mov [edx+(nf57_24-nf57_11)], bl mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf57_25-nf57_11)], bh mov [edx+(nf57_26-nf57_11)], bl shr ebx, 16 mov [edx+(nf57_27-nf57_11)], bh mov [edx+(nf57_28-nf57_11)], bl mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf57_31-nf57_11)], bh mov [edx+(nf57_32-nf57_11)], bl shr ebx, 16 mov [edx+(nf57_33-nf57_11)], bh mov [edx+(nf57_34-nf57_11)], bl mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf57_35-nf57_11)], bh mov [edx+(nf57_36-nf57_11)], bl shr ebx, 16 mov [edx+(nf57_37-nf57_11)], bh mov [edx+(nf57_38-nf57_11)], bl mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf57_41-nf57_11)], bh mov 
[edx+(nf57_42-nf57_11)], bl shr ebx, 16 mov [edx+(nf57_43-nf57_11)], bh mov [edx+(nf57_44-nf57_11)], bl mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf57_45-nf57_11)], bh mov [edx+(nf57_46-nf57_11)], bl shr ebx, 16 mov [edx+(nf57_47-nf57_11)], bh mov [edx+(nf57_48-nf57_11)], bl push ebp push esi ; Load bx,dx,cx,bp with four colors if TRANS16 Trans16 bx, esi, 1 Trans16 dx, esi+2 Trans16 cx, esi+4, 1 Trans16 bp, esi+6 else mov bx, [esi] and ebx, 07fffh mov dx, [esi+2] mov cx, [esi+4] and ecx, 07fffh mov bp, [esi+6] endif mov esi, nf_width jmp nf57_0 ; flush prefetch ALIGN 4 nf57_0: nf57_11:mov ax, bx shl eax, 16 nf57_12:mov ax, bx mov [edi], eax mov [edi+esi], eax nf57_13:mov ax, bx shl eax, 16 nf57_14:mov ax, bx mov [edi+4], eax mov [edi+esi+4], eax nf57_15:mov ax, bx shl eax, 16 nf57_16:mov ax, bx mov [edi+8], eax mov [edi+esi+8], eax nf57_17:mov ax, bx shl eax, 16 nf57_18:mov ax, bx mov [edi+12], eax mov [edi+esi+12], eax lea edi, [edi+esi*2] nf57_21:mov ax, bx shl eax, 16 nf57_22:mov ax, bx mov [edi], eax mov [edi+esi], eax nf57_23:mov ax, bx shl eax, 16 nf57_24:mov ax, bx mov [edi+4], eax mov [edi+esi+4], eax nf57_25:mov ax, bx shl eax, 16 nf57_26:mov ax, bx mov [edi+8], eax mov [edi+esi+8], eax nf57_27:mov ax, bx shl eax, 16 nf57_28:mov ax, bx mov [edi+12], eax mov [edi+esi+12], eax lea edi, [edi+esi*2] nf57_31:mov ax, bx shl eax, 16 nf57_32:mov ax, bx mov [edi], eax mov [edi+esi], eax nf57_33:mov ax, bx shl eax, 16 nf57_34:mov ax, bx mov [edi+4], eax mov [edi+esi+4], eax nf57_35:mov ax, bx shl eax, 16 nf57_36:mov ax, bx mov [edi+8], eax mov [edi+esi+8], eax nf57_37:mov ax, bx shl eax, 16 nf57_38:mov ax, bx mov [edi+12], eax mov [edi+esi+12], eax lea edi, [edi+esi*2] nf57_41:mov ax, bx shl eax, 16 nf57_42:mov ax, bx mov [edi], eax mov [edi+esi], eax nf57_43:mov ax, bx shl eax, 16 nf57_44:mov ax, bx mov [edi+4], eax mov [edi+esi+4], eax nf57_45:mov ax, bx shl eax, 16 nf57_46:mov ax, bx mov [edi+8], eax mov [edi+esi+8], eax nf57_47:mov ax, bx shl eax, 16 nf57_48:mov ax, bx mov [edi+12], eax mov [edi+esi+12], eax add edi, esi pop esi pop ebp add esi, 16 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 nf10: ; 2x2 4x4x2 (48 bytes) test word ptr [esi], 08000h jnz nf26 if 0 ;debug add esi, 48 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov4 lea edx, byte ptr ds:nf10_11+2 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf10_11-nf10_11)], bh mov [edx+(nf10_12-nf10_11)], bl shr ebx, 16 mov [edx+(nf10_13-nf10_11)], bh mov [edx+(nf10_14-nf10_11)], bl mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf10_15-nf10_11)], bh mov [edx+(nf10_16-nf10_11)], bl shr ebx, 16 mov [edx+(nf10_17-nf10_11)], bh mov [edx+(nf10_18-nf10_11)], bl mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf10_21-nf10_11)], bh mov [edx+(nf10_22-nf10_11)], bl shr ebx, 16 mov [edx+(nf10_23-nf10_11)], bh mov [edx+(nf10_24-nf10_11)], bl mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf10_25-nf10_11)], bh mov [edx+(nf10_26-nf10_11)], bl shr ebx, 16 mov [edx+(nf10_27-nf10_11)], bh mov [edx+(nf10_28-nf10_11)], bl mov al, [esi+20] mov ebx, [ecx+eax*4] mov [edx+(nf10_31-nf10_11)], bh mov [edx+(nf10_32-nf10_11)], bl shr ebx, 16 mov [edx+(nf10_33-nf10_11)], bh mov [edx+(nf10_34-nf10_11)], bl mov al, [esi+21] mov ebx, [ecx+eax*4] mov [edx+(nf10_35-nf10_11)], bh mov [edx+(nf10_36-nf10_11)], bl shr ebx, 16 mov [edx+(nf10_37-nf10_11)], bh mov [edx+(nf10_38-nf10_11)], bl mov al, [esi+22] mov ebx, [ecx+eax*4] mov [edx+(nf10_41-nf10_11)], bh mov [edx+(nf10_42-nf10_11)], bl shr ebx, 16 mov 
[edx+(nf10_43-nf10_11)], bh mov [edx+(nf10_44-nf10_11)], bl mov al, [esi+23] mov ebx, [ecx+eax*4] mov [edx+(nf10_45-nf10_11)], bh mov [edx+(nf10_46-nf10_11)], bl shr ebx, 16 mov [edx+(nf10_47-nf10_11)], bh mov [edx+(nf10_48-nf10_11)], bl lea edx, [edx+(nf10_51-nf10_11)] mov al, [esi+32] mov ebx, [ecx+eax*4] mov [edx+(nf10_51-nf10_51)], bh mov [edx+(nf10_52-nf10_51)], bl shr ebx, 16 mov [edx+(nf10_53-nf10_51)], bh mov [edx+(nf10_54-nf10_51)], bl mov al, [esi+33] mov ebx, [ecx+eax*4] mov [edx+(nf10_55-nf10_51)], bh mov [edx+(nf10_56-nf10_51)], bl shr ebx, 16 mov [edx+(nf10_57-nf10_51)], bh mov [edx+(nf10_58-nf10_51)], bl mov al, [esi+34] mov ebx, [ecx+eax*4] mov [edx+(nf10_61-nf10_51)], bh mov [edx+(nf10_62-nf10_51)], bl shr ebx, 16 mov [edx+(nf10_63-nf10_51)], bh mov [edx+(nf10_64-nf10_51)], bl mov al, [esi+35] mov ebx, [ecx+eax*4] mov [edx+(nf10_65-nf10_51)], bh mov [edx+(nf10_66-nf10_51)], bl shr ebx, 16 mov [edx+(nf10_67-nf10_51)], bh mov [edx+(nf10_68-nf10_51)], bl mov al, [esi+44] mov ebx, [ecx+eax*4] mov [edx+(nf10_71-nf10_51)], bh mov [edx+(nf10_72-nf10_51)], bl shr ebx, 16 mov [edx+(nf10_73-nf10_51)], bh mov [edx+(nf10_74-nf10_51)], bl mov al, [esi+45] mov ebx, [ecx+eax*4] mov [edx+(nf10_75-nf10_51)], bh mov [edx+(nf10_76-nf10_51)], bl shr ebx, 16 mov [edx+(nf10_77-nf10_51)], bh mov [edx+(nf10_78-nf10_51)], bl mov al, [esi+46] mov ebx, [ecx+eax*4] mov [edx+(nf10_81-nf10_51)], bh mov [edx+(nf10_82-nf10_51)], bl shr ebx, 16 mov [edx+(nf10_83-nf10_51)], bh mov [edx+(nf10_84-nf10_51)], bl mov al, [esi+47] mov ebx, [ecx+eax*4] mov [edx+(nf10_85-nf10_51)], bh mov [edx+(nf10_86-nf10_51)], bl shr ebx, 16 mov [edx+(nf10_87-nf10_51)], bh mov [edx+(nf10_88-nf10_51)], bl push ebp push esi ; Load bx,dx,cx,bp with four colors if TRANS16 Trans16 bx, esi Trans16 dx, esi+2 Trans16 cx, esi+4 Trans16 bp, esi+6 else mov bx, [esi] mov dx, [esi+2] mov cx, [esi+4] mov bp, [esi+6] endif mov esi, nf_width jmp nf10_0 ; flush prefetch ALIGN 4 nf10_0: nf10_11:mov ax, bx shl eax, 16 nf10_12:mov ax, bx mov [edi], eax nf10_13:mov ax, bx shl eax, 16 nf10_14:mov ax, bx mov [edi+4], eax add edi, esi nf10_15:mov ax, bx shl eax, 16 nf10_16:mov ax, bx mov [edi], eax nf10_17:mov ax, bx shl eax, 16 nf10_18:mov ax, bx mov [edi+4], eax add edi, esi nf10_21:mov ax, bx shl eax, 16 nf10_22:mov ax, bx mov [edi], eax nf10_23:mov ax, bx shl eax, 16 nf10_24:mov ax, bx mov [edi+4], eax add edi, esi nf10_25:mov ax, bx shl eax, 16 nf10_26:mov ax, bx mov [edi], eax nf10_27:mov ax, bx shl eax, 16 nf10_28:mov ax, bx mov [edi+4], eax add edi, esi ; Load bx,dx,cx,bp with four colors if TRANS16 mov esi, [esp] Trans16 bx, esi+12 Trans16 dx, esi+14 Trans16 cx, esi+16 Trans16 bp, esi+18 mov esi, nf_width else mov eax, [esp] mov bx, [eax+12] mov dx, [eax+14] mov cx, [eax+16] mov bp, [eax+18] endif nf10_31:mov ax, bx shl eax, 16 nf10_32:mov ax, bx mov [edi], eax nf10_33:mov ax, bx shl eax, 16 nf10_34:mov ax, bx mov [edi+4], eax add edi, esi nf10_35:mov ax, bx shl eax, 16 nf10_36:mov ax, bx mov [edi], eax nf10_37:mov ax, bx shl eax, 16 nf10_38:mov ax, bx mov [edi+4], eax add edi, esi nf10_41:mov ax, bx shl eax, 16 nf10_42:mov ax, bx mov [edi], eax nf10_43:mov ax, bx shl eax, 16 nf10_44:mov ax, bx mov [edi+4], eax add edi, esi nf10_45:mov ax, bx shl eax, 16 nf10_46:mov ax, bx mov [edi], eax nf10_47:mov ax, bx shl eax, 16 nf10_48:mov ax, bx mov [edi+4], eax add edi, esi lea eax, [esi*8-8] sub edi, eax ; Load bx,dx,cx,bp with four colors if TRANS16 mov esi, [esp] Trans16 bx, esi+24 Trans16 dx, esi+26 Trans16 cx, esi+28 Trans16 bp, esi+30 mov esi, 
nf_width else mov eax, [esp] mov bx, [eax+24] mov dx, [eax+26] mov cx, [eax+28] mov bp, [eax+30] endif nf10_51:mov ax, bx shl eax, 16 nf10_52:mov ax, bx mov [edi], eax nf10_53:mov ax, bx shl eax, 16 nf10_54:mov ax, bx mov [edi+4], eax add edi, esi nf10_55:mov ax, bx shl eax, 16 nf10_56:mov ax, bx mov [edi], eax nf10_57:mov ax, bx shl eax, 16 nf10_58:mov ax, bx mov [edi+4], eax add edi, esi nf10_61:mov ax, bx shl eax, 16 nf10_62:mov ax, bx mov [edi], eax nf10_63:mov ax, bx shl eax, 16 nf10_64:mov ax, bx mov [edi+4], eax add edi, esi nf10_65:mov ax, bx shl eax, 16 nf10_66:mov ax, bx mov [edi], eax nf10_67:mov ax, bx shl eax, 16 nf10_68:mov ax, bx mov [edi+4], eax add edi, esi ; Load bx,dx,cx,bp with four colors if TRANS16 mov esi, [esp] Trans16 bx, esi+36 Trans16 dx, esi+38 Trans16 cx, esi+40 Trans16 bp, esi+42 mov esi, nf_width else mov eax, [esp] mov bx, [eax+36] mov dx, [eax+38] mov cx, [eax+40] mov bp, [eax+42] endif nf10_71:mov ax, bx shl eax, 16 nf10_72:mov ax, bx mov [edi], eax nf10_73:mov ax, bx shl eax, 16 nf10_74:mov ax, bx mov [edi+4], eax add edi, esi nf10_75:mov ax, bx shl eax, 16 nf10_76:mov ax, bx mov [edi], eax nf10_77:mov ax, bx shl eax, 16 nf10_78:mov ax, bx mov [edi+4], eax add edi, esi nf10_81:mov ax, bx shl eax, 16 nf10_82:mov ax, bx mov [edi], eax nf10_83:mov ax, bx shl eax, 16 nf10_84:mov ax, bx mov [edi+4], eax add edi, esi nf10_85:mov ax, bx shl eax, 16 nf10_86:mov ax, bx mov [edi], eax nf10_87:mov ax, bx shl eax, 16 nf10_88:mov ax, bx mov [edi+4], eax pop esi pop ebp add esi, 48 sub edi, 8 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf10+16 nf26: ; 2x1 4x8x2 (32 bytes) test word ptr [esi+16], 08000h jnz nf42 if 0 ;debug add esi, 32 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov4 lea edx, byte ptr ds:nf26_11+2 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf26_11-nf26_11)], bh mov [edx+(nf26_12-nf26_11)], bl shr ebx, 16 mov [edx+(nf26_13-nf26_11)], bh mov [edx+(nf26_14-nf26_11)], bl mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf26_15-nf26_11)], bh mov [edx+(nf26_16-nf26_11)], bl shr ebx, 16 mov [edx+(nf26_17-nf26_11)], bh mov [edx+(nf26_18-nf26_11)], bl mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf26_21-nf26_11)], bh mov [edx+(nf26_22-nf26_11)], bl shr ebx, 16 mov [edx+(nf26_23-nf26_11)], bh mov [edx+(nf26_24-nf26_11)], bl mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf26_25-nf26_11)], bh mov [edx+(nf26_26-nf26_11)], bl shr ebx, 16 mov [edx+(nf26_27-nf26_11)], bh mov [edx+(nf26_28-nf26_11)], bl mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf26_31-nf26_11)], bh mov [edx+(nf26_32-nf26_11)], bl shr ebx, 16 mov [edx+(nf26_33-nf26_11)], bh mov [edx+(nf26_34-nf26_11)], bl mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf26_35-nf26_11)], bh mov [edx+(nf26_36-nf26_11)], bl shr ebx, 16 mov [edx+(nf26_37-nf26_11)], bh mov [edx+(nf26_38-nf26_11)], bl mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf26_41-nf26_11)], bh mov [edx+(nf26_42-nf26_11)], bl shr ebx, 16 mov [edx+(nf26_43-nf26_11)], bh mov [edx+(nf26_44-nf26_11)], bl mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf26_45-nf26_11)], bh mov [edx+(nf26_46-nf26_11)], bl shr ebx, 16 mov [edx+(nf26_47-nf26_11)], bh mov [edx+(nf26_48-nf26_11)], bl lea edx, [edx+(nf26_51-nf26_11)] mov al, [esi+24] mov ebx, [ecx+eax*4] mov [edx+(nf26_51-nf26_51)], bh mov [edx+(nf26_52-nf26_51)], bl shr ebx, 16 mov [edx+(nf26_53-nf26_51)], bh mov [edx+(nf26_54-nf26_51)], bl mov al, [esi+25] mov ebx, [ecx+eax*4] mov [edx+(nf26_55-nf26_51)], bh mov 
[edx+(nf26_56-nf26_51)], bl shr ebx, 16 mov [edx+(nf26_57-nf26_51)], bh mov [edx+(nf26_58-nf26_51)], bl mov al, [esi+26] mov ebx, [ecx+eax*4] mov [edx+(nf26_61-nf26_51)], bh mov [edx+(nf26_62-nf26_51)], bl shr ebx, 16 mov [edx+(nf26_63-nf26_51)], bh mov [edx+(nf26_64-nf26_51)], bl mov al, [esi+27] mov ebx, [ecx+eax*4] mov [edx+(nf26_65-nf26_51)], bh mov [edx+(nf26_66-nf26_51)], bl shr ebx, 16 mov [edx+(nf26_67-nf26_51)], bh mov [edx+(nf26_68-nf26_51)], bl mov al, [esi+28] mov ebx, [ecx+eax*4] mov [edx+(nf26_71-nf26_51)], bh mov [edx+(nf26_72-nf26_51)], bl shr ebx, 16 mov [edx+(nf26_73-nf26_51)], bh mov [edx+(nf26_74-nf26_51)], bl mov al, [esi+29] mov ebx, [ecx+eax*4] mov [edx+(nf26_75-nf26_51)], bh mov [edx+(nf26_76-nf26_51)], bl shr ebx, 16 mov [edx+(nf26_77-nf26_51)], bh mov [edx+(nf26_78-nf26_51)], bl mov al, [esi+30] mov ebx, [ecx+eax*4] mov [edx+(nf26_81-nf26_51)], bh mov [edx+(nf26_82-nf26_51)], bl shr ebx, 16 mov [edx+(nf26_83-nf26_51)], bh mov [edx+(nf26_84-nf26_51)], bl mov al, [esi+31] mov ebx, [ecx+eax*4] mov [edx+(nf26_85-nf26_51)], bh mov [edx+(nf26_86-nf26_51)], bl shr ebx, 16 mov [edx+(nf26_87-nf26_51)], bh mov [edx+(nf26_88-nf26_51)], bl push ebp push esi ; Load bx,dx,cx,bp with four colors if TRANS16 Trans16 bx, esi, 1 Trans16 dx, esi+2 Trans16 cx, esi+4 Trans16 bp, esi+6 else mov bx, [esi] and ebx, 07fffh mov dx, [esi+2] mov cx, [esi+4] mov bp, [esi+6] endif mov esi, nf_width jmp nf26_0 ; flush prefetch ALIGN 4 nf26_0: nf26_11:mov ax, bx shl eax, 16 nf26_12:mov ax, bx mov [edi], eax nf26_13:mov ax, bx shl eax, 16 nf26_14:mov ax, bx mov [edi+4], eax add edi, esi nf26_15:mov ax, bx shl eax, 16 nf26_16:mov ax, bx mov [edi], eax nf26_17:mov ax, bx shl eax, 16 nf26_18:mov ax, bx mov [edi+4], eax add edi, esi nf26_21:mov ax, bx shl eax, 16 nf26_22:mov ax, bx mov [edi], eax nf26_23:mov ax, bx shl eax, 16 nf26_24:mov ax, bx mov [edi+4], eax add edi, esi nf26_25:mov ax, bx shl eax, 16 nf26_26:mov ax, bx mov [edi], eax nf26_27:mov ax, bx shl eax, 16 nf26_28:mov ax, bx mov [edi+4], eax add edi, esi nf26_31:mov ax, bx shl eax, 16 nf26_32:mov ax, bx mov [edi], eax nf26_33:mov ax, bx shl eax, 16 nf26_34:mov ax, bx mov [edi+4], eax add edi, esi nf26_35:mov ax, bx shl eax, 16 nf26_36:mov ax, bx mov [edi], eax nf26_37:mov ax, bx shl eax, 16 nf26_38:mov ax, bx mov [edi+4], eax add edi, esi nf26_41:mov ax, bx shl eax, 16 nf26_42:mov ax, bx mov [edi], eax nf26_43:mov ax, bx shl eax, 16 nf26_44:mov ax, bx mov [edi+4], eax add edi, esi nf26_45:mov ax, bx shl eax, 16 nf26_46:mov ax, bx mov [edi], eax nf26_47:mov ax, bx shl eax, 16 nf26_48:mov ax, bx mov [edi+4], eax add edi, esi lea eax, [esi*8-8] sub edi, eax ; Load bx,dx,cx,bp with four colors if TRANS16 mov esi, [esp] Trans16 bx, esi+16 Trans16 dx, esi+18 Trans16 cx, esi+20 Trans16 bp, esi+22 mov esi, nf_width else mov eax, [esp] mov bx, [eax+16] mov dx, [eax+18] mov cx, [eax+20] mov bp, [eax+22] endif nf26_51:mov ax, bx shl eax, 16 nf26_52:mov ax, bx mov [edi], eax nf26_53:mov ax, bx shl eax, 16 nf26_54:mov ax, bx mov [edi+4], eax add edi, esi nf26_55:mov ax, bx shl eax, 16 nf26_56:mov ax, bx mov [edi], eax nf26_57:mov ax, bx shl eax, 16 nf26_58:mov ax, bx mov [edi+4], eax add edi, esi nf26_61:mov ax, bx shl eax, 16 nf26_62:mov ax, bx mov [edi], eax nf26_63:mov ax, bx shl eax, 16 nf26_64:mov ax, bx mov [edi+4], eax add edi, esi nf26_65:mov ax, bx shl eax, 16 nf26_66:mov ax, bx mov [edi], eax nf26_67:mov ax, bx shl eax, 16 nf26_68:mov ax, bx mov [edi+4], eax add edi, esi nf26_71:mov ax, bx shl eax, 16 nf26_72:mov ax, bx mov [edi], eax 
nf26_73:mov ax, bx shl eax, 16 nf26_74:mov ax, bx mov [edi+4], eax add edi, esi nf26_75:mov ax, bx shl eax, 16 nf26_76:mov ax, bx mov [edi], eax nf26_77:mov ax, bx shl eax, 16 nf26_78:mov ax, bx mov [edi+4], eax add edi, esi nf26_81:mov ax, bx shl eax, 16 nf26_82:mov ax, bx mov [edi], eax nf26_83:mov ax, bx shl eax, 16 nf26_84:mov ax, bx mov [edi+4], eax add edi, esi nf26_85:mov ax, bx shl eax, 16 nf26_86:mov ax, bx mov [edi], eax nf26_87:mov ax, bx shl eax, 16 nf26_88:mov ax, bx mov [edi+4], eax pop esi pop ebp add esi, 32 sub edi, 8 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 ;nf10+32 nf42: ; 1x2 8x4x2 (32 bytes) if 0 ;debug add esi, 32 mov ebx, 0 jmp nf_solid endif xor eax, eax lea ecx, nfhpk_mov4 lea edx, byte ptr ds:nf42_11+2 mov al, [esi+8] mov ebx, [ecx+eax*4] mov [edx+(nf42_11-nf42_11)], bh mov [edx+(nf42_12-nf42_11)], bl shr ebx, 16 mov [edx+(nf42_13-nf42_11)], bh mov [edx+(nf42_14-nf42_11)], bl mov al, [esi+9] mov ebx, [ecx+eax*4] mov [edx+(nf42_15-nf42_11)], bh mov [edx+(nf42_16-nf42_11)], bl shr ebx, 16 mov [edx+(nf42_17-nf42_11)], bh mov [edx+(nf42_18-nf42_11)], bl mov al, [esi+10] mov ebx, [ecx+eax*4] mov [edx+(nf42_21-nf42_11)], bh mov [edx+(nf42_22-nf42_11)], bl shr ebx, 16 mov [edx+(nf42_23-nf42_11)], bh mov [edx+(nf42_24-nf42_11)], bl mov al, [esi+11] mov ebx, [ecx+eax*4] mov [edx+(nf42_25-nf42_11)], bh mov [edx+(nf42_26-nf42_11)], bl shr ebx, 16 mov [edx+(nf42_27-nf42_11)], bh mov [edx+(nf42_28-nf42_11)], bl mov al, [esi+12] mov ebx, [ecx+eax*4] mov [edx+(nf42_31-nf42_11)], bh mov [edx+(nf42_32-nf42_11)], bl shr ebx, 16 mov [edx+(nf42_33-nf42_11)], bh mov [edx+(nf42_34-nf42_11)], bl mov al, [esi+13] mov ebx, [ecx+eax*4] mov [edx+(nf42_35-nf42_11)], bh mov [edx+(nf42_36-nf42_11)], bl shr ebx, 16 mov [edx+(nf42_37-nf42_11)], bh mov [edx+(nf42_38-nf42_11)], bl mov al, [esi+14] mov ebx, [ecx+eax*4] mov [edx+(nf42_41-nf42_11)], bh mov [edx+(nf42_42-nf42_11)], bl shr ebx, 16 mov [edx+(nf42_43-nf42_11)], bh mov [edx+(nf42_44-nf42_11)], bl mov al, [esi+15] mov ebx, [ecx+eax*4] mov [edx+(nf42_45-nf42_11)], bh mov [edx+(nf42_46-nf42_11)], bl shr ebx, 16 mov [edx+(nf42_47-nf42_11)], bh mov [edx+(nf42_48-nf42_11)], bl lea edx, [edx+(nf42_51-nf42_11)] mov al, [esi+24] mov ebx, [ecx+eax*4] mov [edx+(nf42_51-nf42_51)], bh mov [edx+(nf42_52-nf42_51)], bl shr ebx, 16 mov [edx+(nf42_53-nf42_51)], bh mov [edx+(nf42_54-nf42_51)], bl mov al, [esi+25] mov ebx, [ecx+eax*4] mov [edx+(nf42_55-nf42_51)], bh mov [edx+(nf42_56-nf42_51)], bl shr ebx, 16 mov [edx+(nf42_57-nf42_51)], bh mov [edx+(nf42_58-nf42_51)], bl mov al, [esi+26] mov ebx, [ecx+eax*4] mov [edx+(nf42_61-nf42_51)], bh mov [edx+(nf42_62-nf42_51)], bl shr ebx, 16 mov [edx+(nf42_63-nf42_51)], bh mov [edx+(nf42_64-nf42_51)], bl mov al, [esi+27] mov ebx, [ecx+eax*4] mov [edx+(nf42_65-nf42_51)], bh mov [edx+(nf42_66-nf42_51)], bl shr ebx, 16 mov [edx+(nf42_67-nf42_51)], bh mov [edx+(nf42_68-nf42_51)], bl mov al, [esi+28] mov ebx, [ecx+eax*4] mov [edx+(nf42_71-nf42_51)], bh mov [edx+(nf42_72-nf42_51)], bl shr ebx, 16 mov [edx+(nf42_73-nf42_51)], bh mov [edx+(nf42_74-nf42_51)], bl mov al, [esi+29] mov ebx, [ecx+eax*4] mov [edx+(nf42_75-nf42_51)], bh mov [edx+(nf42_76-nf42_51)], bl shr ebx, 16 mov [edx+(nf42_77-nf42_51)], bh mov [edx+(nf42_78-nf42_51)], bl mov al, [esi+30] mov ebx, [ecx+eax*4] mov [edx+(nf42_81-nf42_51)], bh mov [edx+(nf42_82-nf42_51)], bl shr ebx, 16 mov [edx+(nf42_83-nf42_51)], bh mov [edx+(nf42_84-nf42_51)], bl mov al, [esi+31] mov ebx, [ecx+eax*4] mov 
[edx+(nf42_85-nf42_51)], bh mov [edx+(nf42_86-nf42_51)], bl shr ebx, 16 mov [edx+(nf42_87-nf42_51)], bh mov [edx+(nf42_88-nf42_51)], bl push ebp push esi ; Load bx,dx,cx,bp with four colors if TRANS16 Trans16 bx, esi, 1 Trans16 dx, esi+2 Trans16 cx, esi+4 Trans16 bp, esi+6 else mov bx, [esi] and ebx, 07fffh mov dx, [esi+2] mov cx, [esi+4] mov bp, [esi+6] endif mov esi, nf_width jmp nf42_0 ; flush prefetch ALIGN 4 nf42_0: nf42_11:mov ax, bx shl eax, 16 nf42_12:mov ax, bx mov [edi], eax nf42_13:mov ax, bx shl eax, 16 nf42_14:mov ax, bx mov [edi+4], eax nf42_15:mov ax, bx shl eax, 16 nf42_16:mov ax, bx mov [edi+8], eax nf42_17:mov ax, bx shl eax, 16 nf42_18:mov ax, bx mov [edi+12], eax add edi, esi nf42_21:mov ax, bx shl eax, 16 nf42_22:mov ax, bx mov [edi], eax nf42_23:mov ax, bx shl eax, 16 nf42_24:mov ax, bx mov [edi+4], eax nf42_25:mov ax, bx shl eax, 16 nf42_26:mov ax, bx mov [edi+8], eax nf42_27:mov ax, bx shl eax, 16 nf42_28:mov ax, bx mov [edi+12], eax add edi, esi nf42_31:mov ax, bx shl eax, 16 nf42_32:mov ax, bx mov [edi], eax nf42_33:mov ax, bx shl eax, 16 nf42_34:mov ax, bx mov [edi+4], eax nf42_35:mov ax, bx shl eax, 16 nf42_36:mov ax, bx mov [edi+8], eax nf42_37:mov ax, bx shl eax, 16 nf42_38:mov ax, bx mov [edi+12], eax add edi, esi nf42_41:mov ax, bx shl eax, 16 nf42_42:mov ax, bx mov [edi], eax nf42_43:mov ax, bx shl eax, 16 nf42_44:mov ax, bx mov [edi+4], eax nf42_45:mov ax, bx shl eax, 16 nf42_46:mov ax, bx mov [edi+8], eax nf42_47:mov ax, bx shl eax, 16 nf42_48:mov ax, bx mov [edi+12], eax add edi, esi ; Load bx,dx,cx,bp with four colors if TRANS16 mov esi, [esp] Trans16 bx, esi+16, 1 Trans16 dx, esi+18 Trans16 cx, esi+20 Trans16 bp, esi+22 mov esi, nf_width else mov eax, [esp] mov bx, [eax+16] and ebx, 07fffh mov dx, [eax+18] mov cx, [eax+20] mov bp, [eax+22] endif nf42_51:mov ax, bx shl eax, 16 nf42_52:mov ax, bx mov [edi], eax nf42_53:mov ax, bx shl eax, 16 nf42_54:mov ax, bx mov [edi+4], eax nf42_55:mov ax, bx shl eax, 16 nf42_56:mov ax, bx mov [edi+8], eax nf42_57:mov ax, bx shl eax, 16 nf42_58:mov ax, bx mov [edi+12], eax add edi, esi nf42_61:mov ax, bx shl eax, 16 nf42_62:mov ax, bx mov [edi], eax nf42_63:mov ax, bx shl eax, 16 nf42_64:mov ax, bx mov [edi+4], eax nf42_65:mov ax, bx shl eax, 16 nf42_66:mov ax, bx mov [edi+8], eax nf42_67:mov ax, bx shl eax, 16 nf42_68:mov ax, bx mov [edi+12], eax add edi, esi nf42_71:mov ax, bx shl eax, 16 nf42_72:mov ax, bx mov [edi], eax nf42_73:mov ax, bx shl eax, 16 nf42_74:mov ax, bx mov [edi+4], eax nf42_75:mov ax, bx shl eax, 16 nf42_76:mov ax, bx mov [edi+8], eax nf42_77:mov ax, bx shl eax, 16 nf42_78:mov ax, bx mov [edi+12], eax add edi, esi nf42_81:mov ax, bx shl eax, 16 nf42_82:mov ax, bx mov [edi], eax nf42_83:mov ax, bx shl eax, 16 nf42_84:mov ax, bx mov [edi+4], eax nf42_85:mov ax, bx shl eax, 16 nf42_86:mov ax, bx mov [edi+8], eax nf42_87:mov ax, bx shl eax, 16 nf42_88:mov ax, bx mov [edi+12], eax pop esi pop ebp add esi, 32 sub edi, nfpk_back_right retn ;---------------------------------------- ALIGN 4 nf11: ; 8x8x16 (128 bytes) if 0 ;debug add esi, 128 mov ebx, 0 jmp nf_solid endif mov edx, nf_width if TRANS16 Trans16Blk MACRO idx Trans16 bx, idx mov [edi], bx Trans16 bx, idx+2 mov [edi+2], bx Trans16 bx, idx+4 mov [edi+4], bx Trans16 bx, idx+6 mov [edi+6], bx Trans16 bx, idx+8 mov [edi+8], bx Trans16 bx, idx+10 mov [edi+10], bx Trans16 bx, idx+12 mov [edi+12], bx Trans16 bx, idx+14 mov [edi+14], bx ENDM Trans16Blk esi ;0 add edi, edx Trans16Blk esi+16 ;1 add edi, edx Trans16Blk esi+32 ;2 add edi, edx Trans16Blk 
esi+48 ;3 add edi, edx Trans16Blk esi+64 ;4 add edi, edx Trans16Blk esi+80 ;5 add edi, edx Trans16Blk esi+96 ;6 add edi, edx Trans16Blk esi+112 ;7 else mov eax, [esi] ;0 mov [edi], eax mov eax, [esi+4] mov [edi+4], eax mov eax, [esi+8] mov [edi+8], eax mov eax, [esi+12] mov [edi+12], eax add edi, edx mov eax, [esi+16] ;1 mov [edi], eax mov eax, [esi+20] mov [edi+4], eax mov eax, [esi+24] mov [edi+8], eax mov eax, [esi+28] mov [edi+12], eax add edi, edx mov eax, [esi+32] ;2 mov [edi], eax mov eax, [esi+36] mov [edi+4], eax mov eax, [esi+40] mov [edi+8], eax mov eax, [esi+44] mov [edi+12], eax add edi, edx mov eax, [esi+48] ;3 mov [edi], eax mov eax, [esi+52] mov [edi+4], eax mov eax, [esi+56] mov [edi+8], eax mov eax, [esi+60] mov [edi+12], eax add edi, edx mov eax, [esi+64] ;4 mov [edi], eax mov eax, [esi+68] mov [edi+4], eax mov eax, [esi+72] mov [edi+8], eax mov eax, [esi+76] mov [edi+12], eax add edi, edx mov eax, [esi+80] ;5 mov [edi], eax mov eax, [esi+84] mov [edi+4], eax mov eax, [esi+88] mov [edi+8], eax mov eax, [esi+92] mov [edi+12], eax add edi, edx mov eax, [esi+96] ;6 mov [edi], eax mov eax, [esi+100] mov [edi+4], eax mov eax, [esi+104] mov [edi+8], eax mov eax, [esi+108] mov [edi+12], eax add edi, edx mov eax, [esi+112] ;7 mov [edi], eax mov eax, [esi+116] mov [edi+4], eax mov eax, [esi+120] mov [edi+8], eax mov eax, [esi+124] mov [edi+12], eax endif add esi, 128 sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf12: ; low 4x4x16 (32 bytes) if 0 ;debug add esi, 32 mov ebx, 0 jmp nf_solid endif mov edx, nf_width Trans16 bx, esi shrd eax, ebx, 16 mov ax, bx mov [edi], eax mov [edi+edx], eax Trans16 bx, esi+2 shrd eax, ebx, 16 mov ax, bx mov [edi+4], eax mov [edi+edx+4], eax Trans16 bx, esi+4 shrd eax, ebx, 16 mov ax, bx mov [edi+8], eax mov [edi+edx+8], eax Trans16 bx, esi+6 shrd eax, ebx, 16 mov ax, bx mov [edi+12], eax mov [edi+edx+12], eax lea edi, [edi+edx*2] Trans16 bx, esi+8 shrd eax, ebx, 16 mov ax, bx mov [edi], eax mov [edi+edx], eax Trans16 bx, esi+10 shrd eax, ebx, 16 mov ax, bx mov [edi+4], eax mov [edi+edx+4], eax Trans16 bx, esi+12 shrd eax, ebx, 16 mov ax, bx mov [edi+8], eax mov [edi+edx+8], eax Trans16 bx, esi+14 shrd eax, ebx, 16 mov ax, bx mov [edi+12], eax mov [edi+edx+12], eax lea edi, [edi+edx*2] Trans16 bx, esi+16 shrd eax, ebx, 16 mov ax, bx mov [edi], eax mov [edi+edx], eax Trans16 bx, esi+18 shrd eax, ebx, 16 mov ax, bx mov [edi+4], eax mov [edi+edx+4], eax Trans16 bx, esi+20 shrd eax, ebx, 16 mov ax, bx mov [edi+8], eax mov [edi+edx+8], eax Trans16 bx, esi+22 shrd eax, ebx, 16 mov ax, bx mov [edi+12], eax mov [edi+edx+12], eax lea edi, [edi+edx*2] Trans16 bx, esi+24 shrd eax, ebx, 16 mov ax, bx mov [edi], eax mov [edi+edx], eax Trans16 bx, esi+26 shrd eax, ebx, 16 mov ax, bx mov [edi+4], eax mov [edi+edx+4], eax Trans16 bx, esi+28 shrd eax, ebx, 16 mov ax, bx mov [edi+8], eax mov [edi+edx+8], eax Trans16 bx, esi+30 shrd eax, ebx, 16 mov ax, bx mov [edi+12], eax mov [edi+edx+12], eax add edi, edx sub edi, nfpk_back_right add esi, 32 retn ;---------------------------------------- ALIGN 4 nf13: ; 2x2 4x4x0 (8 bytes) if 0 ;debug add esi, 8 mov ebx, 0 jmp nf_solid endif mov edx, nf_width if TRANS16 Trans16 cx, esi shrd ebx, ecx, 16 mov bx, cx Trans16 cx, esi+2 shrd eax, ecx, 16 mov ax, cx mov ecx, eax else mov ax, [esi] shrd ebx, eax, 16 mov bx, ax mov ax, [esi+2] shrd ecx, eax, 16 mov cx, ax endif mov [edi], ebx mov [edi+4], ebx mov [edi+8], ecx mov [edi+12], ecx mov [edi+edx], ebx mov [edi+edx+4], ebx 
mov [edi+edx+8], ecx mov [edi+edx+12], ecx lea edi, [edi+edx*2] mov [edi], ebx mov [edi+4], ebx mov [edi+8], ecx mov [edi+12], ecx mov [edi+edx], ebx mov [edi+edx+4], ebx mov [edi+edx+8], ecx mov [edi+edx+12], ecx lea edi, [edi+edx*2] if TRANS16 Trans16 cx, esi+4 shrd ebx, ecx, 16 mov bx, cx Trans16 cx, esi+6 shrd eax, ecx, 16 mov ax, cx mov ecx, eax else mov ax, [esi+4] shrd ebx, eax, 16 mov bx, ax mov ax, [esi+6] shrd ecx, eax, 16 mov cx, ax endif mov [edi], ebx mov [edi+4], ebx mov [edi+8], ecx mov [edi+12], ecx mov [edi+edx], ebx mov [edi+edx+4], ebx mov [edi+edx+8], ecx mov [edi+edx+12], ecx lea edi, [edi+edx*2] mov [edi], ebx mov [edi+4], ebx mov [edi+8], ecx mov [edi+12], ecx mov [edi+edx], ebx mov [edi+edx+4], ebx mov [edi+edx+8], ecx mov [edi+edx+12], ecx add edi, edx sub edi, nfpk_back_right add esi, 8 retn ;---------------------------------------- ALIGN 4 nf14: ; 8x8x0 (2 bytes) Trans16 cx, esi add esi, 2 shrd ebx, ecx, 16 mov bx, cx nf_solid: mov edx, nf_width mov [edi], ebx mov [edi+4], ebx mov [edi+8], ebx mov [edi+12], ebx add edi, edx mov [edi], ebx mov [edi+4], ebx mov [edi+8], ebx mov [edi+12], ebx add edi, edx mov [edi], ebx mov [edi+4], ebx mov [edi+8], ebx mov [edi+12], ebx add edi, edx mov [edi], ebx mov [edi+4], ebx mov [edi+8], ebx mov [edi+12], ebx add edi, edx mov [edi], ebx mov [edi+4], ebx mov [edi+8], ebx mov [edi+12], ebx add edi, edx mov [edi], ebx mov [edi+4], ebx mov [edi+8], ebx mov [edi+12], ebx add edi, edx mov [edi], ebx mov [edi+4], ebx mov [edi+8], ebx mov [edi+12], ebx add edi, edx mov [edi], ebx mov [edi+4], ebx mov [edi+8], ebx mov [edi+12], ebx sub edi, nfpk_back_right ; (SHEIGHT-1)*width+8 retn ;---------------------------------------- ALIGN 4 nf15: ; unused retn nfHPkDecomp ENDP endif ; PKDATA ;--------------------------------------------------------------------- ; ShowFrame ;------------ EXTERN sf_LineWidth: DWORD ;unsigned sf_LineWidth; // Distance between lines in memory ; Banked screen parameters EXTERN sf_SetBank: PTRPROC ;unsigned long sf_SetBank; EXTERN sf_WinGran: DWORD ;unsigned sf_WinGran; EXTERN sf_WinSize: DWORD ;unsigned long sf_WinSize; EXTERN sf_WinGranPerSize: DWORD ;unsigned sf_WinGranPerSize; ;{sf_WriteWinPtr and sf_WriteWinLimit replace sf_WriteWinSeg, see mveliba.asm} EXTERN sf_WriteWinPtr: PTRBYTE ;unsigned char *sf_WriteWinPtr; EXTERN sf_WriteWinLimit: PTRBYTE ;unsigned char *WriteWinLimit; EXTERN sf_WriteWin: DWORD ;unsigned sf_WriteWin; if SCALING EXTERN opt_hscale_step: DWORD EXTERN opt_hscale_adj: DWORD endif ;void mve_ShowFrameField( ; unsigned char *buf, unsigned bufw, unsigned bufh, ; unsigned sx, unsigned sy, unsigned w, unsigned h, ; unsigned dstx, unsigned dsty, unsigned field) mve_ShowFrameField PROC USES ESI EDI EBX, \ buf:PTRBYTE, bufw:DWORD, bufh:DWORD, \ sx:DWORD, sy:DWORD, w:DWORD, h:DWORD, \ dstx:DWORD, dsty:DWORD, field:DWORD LOCAL bank:DWORD LOCAL w4:DWORD LOCAL new_src_line:DWORD LOCAL linestep:DWORD LOCAL new_dst_line:DWORD mov ax, ds ; Insure es==ds for symantec flat mode mov es, ax mov eax, w ; w4 = w>>2 shr eax, 2 mov w4, eax ;;; ;;; In stretched width mode, we either keep 4/5 (a) of the source pixels, ;;; or duplicate every fourth pixel to magnify by 5/4 (b). ;;; In these cases, new_src_line is either bufw-w*5/4 (a) or bufw-w*4/5 (b). ;;; Let ScaleStep be 5 (a) or 3 (b) instead of 4. This is the amount to advance ;;; the source after copying 32-bits from source to destination. ;;; The coordinate system used for the source will be a simulated scaled system. 
;;; Rather than scale height, I plan to use alternate vertical resolutions. However, ;;; it might be a good idea to also provide for scaled height in case we want a ;;; higher resolution border. ;;; Question: Do we still need to support transferring subrectangles? if SCALING .if opt_hscale_step==4 endif mov eax, bufw ; new_src_line = bufw - w sub eax, w mov new_src_line, eax if SCALING .else mov eax, opt_hscale_adj mov new_src_line, eax .endif endif mov eax, sf_LineWidth ; linestep = sf_LineWidth<<1; .if field ; if (field) add eax, eax ; linestep <<= 1; .endif mov linestep, eax sub eax, w ; new_dst_line = linestep - w; mov new_dst_line, eax mov eax, sy ; buf += sy*bufw + sx mul bufw add eax, sx add buf, eax mov eax, sx ; dstx += sx add dstx, eax ; This is a hack. We should pass in src x,y of origin ; or make dstx/dsty absolute. ; mov eax, bufw ; if (field && sx >= (bufw>>1) shr eax, 1 .if field && sx >= eax sub dstx, eax ; dstx -= bufw>>1 .endif mov eax, sy ; dsty += sy add dsty, eax .if sf_SetBank==0 ;------------------ ; dst = WriteWinPtr + (dsty*linestep+dstx) mov edi, sf_WriteWinPtr mov eax, dsty mul linestep add eax, dstx add edi, eax .if field & 1 add edi, sf_LineWidth; .endif mov eax, new_src_line mov edx, new_dst_line mov esi, buf mov ebx, h if SCALING .if opt_hscale_step==3 sub edi, 8 sf_lp2a:mov ecx, w4 shr ecx, 2 ALIGN 4 sf_lp2b:mov eax, [esi] mov [edi+8], eax mov eax, [esi+3] mov [edi+12], eax add edi, 16 mov eax, [esi+6] mov [edi], eax mov eax, [esi+9] mov [edi+4], eax add esi, 12 dec ecx jnz sf_lp2b ; To avoid problem of last pixel coming from next line ; with arrange for w%16==12, so here is where we copy ; last 12 pixels. mov eax, [esi] mov [edi+8], eax mov eax, [esi+3] mov [edi+12], eax add edi, 12 mov eax, [esi+6] mov [edi+4], eax add esi, 9 add esi, new_src_line add edi, edx dec ebx jnz sf_lp2a add edi, 8 .else endif sf_lp: mov ecx, w4 ;width/4 rep movsd add esi, eax add edi, edx dec ebx jnz sf_lp if SCALING .endif endif .else ; sf_SetBank ;------------------ mov esi, buf ; start = dsty * linestep + dstx mov eax, linestep mul dsty .if field & 1 add eax, sf_LineWidth .endif add eax, dstx ; bank = start / WinGran ; dst = (start % WinGran) + sf_WriteWinPtr mov edx, 0 div sf_WinGran mov bank, eax mov edi, edx add edi, sf_WriteWinPtr ; Select new bank mov bh, 0 mov bl, byte ptr sf_WriteWin mov edx, bank call sf_SetBank ; eax/edx destroyed by sf_SetBank sf_0: ; rem = sf_WriteWinLimit - dst mov eax, sf_WriteWinLimit sub eax, edi ; h2 = (rem+(LineWidth-w))/LineWidth add eax, linestep sub eax, w mov edx, 0 div linestep ; if (h>1 shr eax, 1 mov w4, eax ; Number of dst words to transfer mov eax, bufw ; new_src_line = bufw - w sub eax, w mov new_src_line, eax mov eax, sf_LineWidth ; linestep = sf_LineWidth<<1; .if field ; if (field) add eax, eax ; linestep <<= 1; .endif mov linestep, eax sub eax, w ; new_dst_line = linestep - w*2; sub eax, w mov new_dst_line, eax mov eax, sy ; buf += sy*bufw + sx mul bufw add eax, sx add buf, eax mov eax, sx ; dstx += sx add dstx, eax ; This is a hack. We should pass in src x,y of origin ; or make dstx/dsty absolute. 
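; (Rough C sketch of what this prologue computes, in terms of the locals
;  above; BPP here stands for the destination pixel size -- one byte in the
;  straight copy above, two where dstx is doubled below -- and this covers
;  the non-banked (sf_SetBank==0) path; the banked path instead derives a
;  window-relative dst from start/sf_WinGran and start%sf_WinGran:
;	linestep = sf_LineWidth;  if (field) linestep <<= 1;
;	dst = sf_WriteWinPtr + dsty*linestep + dstx*BPP
;	    + ((field & 1) ? sf_LineWidth : 0);   // odd field: one line down
;  each row then copies w source pixels, src += bufw, dst += linestep.)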
; mov eax, bufw ; if (field && sx >= (bufw>>1) shr eax, 1 .if field && sx >= eax sub dstx, eax ; dstx -= bufw>>1 .endif mov eax, sy ; dsty += sy add dsty, eax .if sf_SetBank==0 ;------------------ ; dst = WriteWinPtr + (dsty*linestep+dstx*2) mov edi, sf_WriteWinPtr mov eax, dsty mul linestep add eax, dstx add eax, dstx add edi, eax .if field & 1 add edi, sf_LineWidth; .endif mov esi, buf mov ebx, h sf_lp: mov ecx, w4 ;width/4 push ebx lea ebx, pal15_tbl xor eax, eax sf_movsd1: mov al, [esi] add esi, 2 mov dx, [ebx+eax*2] mov al, [esi-1] shl edx, 16 mov dx, [ebx+eax*2] rol edx, 16 mov [edi], edx add edi, 4 dec ecx jnz sf_movsd1 pop ebx ; rep movsd ;;;;;-----;;;;; add esi, new_src_line add edi, new_dst_line dec ebx jnz sf_lp .else ; sf_SetBank ;------------------ mov esi, buf ; start = dsty * linestep + dstx*2 mov eax, linestep mul dsty .if field & 1 add eax, sf_LineWidth .endif add eax, dstx add eax, dstx ; bank = start / WinGran ; dst = (start % WinGran) + sf_WriteWinPtr mov edx, 0 div sf_WinGran mov bank, eax mov edi, edx add edi, sf_WriteWinPtr ; Select new bank mov bh, 0 mov bl, byte ptr sf_WriteWin mov edx, bank call sf_SetBank ; eax/edx destroyed by sf_SetBank sf_0: ; rem = sf_WriteWinLimit - dst mov eax, sf_WriteWinLimit sub eax, edi ; h2 = (rem+(LineWidth-w*2))/LineWidth add eax, new_dst_line mov edx, 0 div linestep ; if (h=WinSize, we're done with squares (but need to correctly ; adjust si and di!) ; if di+4*cx>WinSize, we need to clip and then we're done ; (but need to correctly adjust si and di!) ; Reduce cx to (WinSize-di)/4. ; limit=WriteWinLimit-4*ax Split: push ebx push edx push esi push edi mov ecx, w mov eax, 0 jmp aTest1 aNext1: mov dx, word ptr [ebx] add ebx, 2 aTest1: add dx, dx jz aNext1 jb aChgd2 add esi, SWIDTH*HI_COLOR_SCALE add edi, SWIDTH*HI_COLOR_SCALE loop aTest1 jmp aDone aNext2: mov dx, [ebx] add ebx, 2 aTest2: add dx, dx ja aCopy3 jz aNext2 aChgd2: add eax, SWIDTH*HI_COLOR_SCALE/4 loop aTest2 call aCopy jmp aDone aCopy3: call aCopy add esi, SWIDTH*HI_COLOR_SCALE add edi, SWIDTH*HI_COLOR_SCALE mov eax, 0 loop aTest1 jmp aDone aCopy: push ebx push ecx push edx push esi push edi mov ecx, eax shl ecx, 2 mov ebx, SrcWidth sub ebx, ecx mov edx, DstWidth sub edx, ecx sub ecx, sf_WriteWinLimit neg ecx mov limit, ecx REPEAT 7 cmp edi, limit jns aFinal mov ecx, eax rep movsd add esi, ebx add edi, edx ENDM cmp edi, limit jns aFinal mov ecx, eax jmp aLast aFinal: mov ecx, sf_WriteWinLimit sub ecx, edi js aCpyDn shr ecx, 2 aLast: rep movsd aCpyDn: pop edi pop esi mov ecx, eax shl ecx, 2 add esi, ecx add edi, ecx pop edx pop ecx pop ebx retn aDone: pop edi pop esi ; Advance bank mov eax, sf_WinGranPerSize add bank, eax sub edi, sf_WinSize ; Select new bank cmp sf_SetBank, 0 jz nobank2 mov bh, 0 mov bl, byte ptr sf_WriteWin mov edx, bank call sf_SetBank ; eax/edx destroyed by SetBank nobank2: pop edx pop ebx ; For start of next bank... ; While di+4*cx<=0, advance si & di by src/dst line step instead of ; doing rep mov ; If di<0, cx += di/4, si-=di, di=0 ; Do remaining rep mov's (first with modified args, remainder with ; with full args). ; Init bx/dx to src/dst line steps. 
; limit=-4*ax mov ecx, w mov eax, 0 jmp bTest1 bNext1: mov dx, [ebx] add ebx, 2 bTest1: add dx, dx jz bNext1 jb bChgd2 add esi, SWIDTH*HI_COLOR_SCALE add edi, SWIDTH*HI_COLOR_SCALE loop bTest1 jmp LineDone bNext2: mov dx, [ebx] add ebx, 2 bTest2: add dx, dx ja bCopy3 jz bNext2 bChgd2: add eax, SWIDTH*HI_COLOR_SCALE/4 loop bTest2 call bCopy jmp LineDone bCopy3: call bCopy add esi, SWIDTH*HI_COLOR_SCALE add edi, SWIDTH*HI_COLOR_SCALE mov eax, 0 loop bTest1 jmp LineDone bCopy: push ebx push ecx push edx mov ecx, eax shl ecx, 2 neg ecx mov limit, ecx mov ebx, SrcWidth mov edx, DstWidth sub edi, sf_WriteWinPtr FOR bMovN, mov ecx, offset bMovN jns bFull cmp limit, edi js bPart add esi, ebx add edi, edx ENDM mov ecx, offset bMov8 jns bFull cmp limit, edi js bPart add edi, sf_WriteWinPtr shl eax, 2 add esi, eax add edi, eax jmp bCpyDn bFull: push ecx mov ecx, eax add ebx, limit add edx, limit add edi, sf_WriteWinPtr retn bPart: push ecx mov ecx, eax sub esi, edi sar edi, 2 add ecx, edi mov edi, sf_WriteWinPtr add ebx, limit add edx, limit retn FOR bMovN, bMovN: rep movsd mov ecx, eax add esi, ebx add edi, edx ENDM bMov8: rep movsd bCpyDn: sub esi, SrcWidth7 sub edi, DstWidth7 pop edx pop ecx pop ebx retn Finished: ret ENDM ; SHOW_FRAME_CHG_BODY ;void ;mve_sfShowFrameChg( ; bool prvbuf, ; unsigned x, unsigned y, unsigned w, unsigned h, ; unsigned short *chgs, ; unsigned dstx, unsigned dsty) ; mve_sfShowFrameChg PROC USES ESI EDI EBX, \ prvbuf:DWORD, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD, \ chgs:PTRWORD, \ dstx:DWORD, dsty:DWORD LOCAL _width:DWORD LOCAL SrcWidth:DWORD LOCAL DstWidth:DWORD LOCAL SrcWidth7:DWORD LOCAL DstWidth7:DWORD LOCAL SrcLineStep:DWORD LOCAL DstLineStep1:DWORD LOCAL DstLineStep2:DWORD LOCAL LineEnd:DWORD LOCAL bank:DWORD LOCAL limit:DWORD SHOW_FRAME_CHG_BODY 0 ; Not HiColor mve_sfShowFrameChg ENDP if HICOLOR ;void ;mve_sfHiColorShowFrameChg( ; bool prvbuf, ; unsigned x, unsigned y, unsigned w, unsigned h, ; unsigned short *chgs, ; unsigned dstx, unsigned dsty) ; mve_sfHiColorShowFrameChg PROC USES ESI EDI EBX, \ prvbuf:DWORD, \ x:DWORD, y:DWORD, w:DWORD, h:DWORD, \ chgs:PTRWORD, \ dstx:DWORD, dsty:DWORD LOCAL _width:DWORD LOCAL SrcWidth:DWORD LOCAL DstWidth:DWORD LOCAL SrcWidth7:DWORD LOCAL DstWidth7:DWORD LOCAL SrcLineStep:DWORD LOCAL DstLineStep1:DWORD LOCAL DstLineStep2:DWORD LOCAL LineEnd:DWORD LOCAL bank:DWORD LOCAL limit:DWORD SHOW_FRAME_CHG_BODY 1 ; HiColor mve_sfHiColorShowFrameChg ENDP endif ;HICOLOR endif ;PARTIAL ;---------------------------------------------------------------------- if 0 ; No supported if PKDATA PK_SHOW_FRAME_CHG_BODY MACRO HI_COLOR_FLAG:REQ LOCAL HI_COLOR_SCALE HI_COLOR_SCALE equ HI_COLOR_FLAG+1 mov eax, w ; _width = w*SWIDTH*HI_COLOR_SCALE; shl eax, LOG2_SWIDTH+HI_COLOR_FLAG mov _width, eax xor ebx, ebx ; ebx = nf_fqty (converted to 32-bits) mov bl, nf_fqty mov eax, nf_width ; SrcWidth = nf_width*nf_fqty; mul ebx ;nf_fqty mov SrcWidth, eax imul eax, (SHEIGHT-1) ; SrcWidth7 = SrcWidth * (SHEIGHT-1) mov SrcWidth7, eax add eax, SrcWidth ; SrcLineStep = SrcWidth*SHEIGHT-_width sub eax, _width mov SrcLineStep, eax mov eax, sf_LineWidth ; DstWidth = sf_LineWidth*nf_fqty; mul ebx ;nf_fqty mov DstWidth, eax imul eax, (SHEIGHT-1) ; DstWidth7 = DstWidth * (SHEIGHT-1) mov DstWidth7, eax ;Note: DstLineStep1+2 = DstWidth*SHEIGHT - _width = ????Not True!!! 
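	; (Checking that note against the values computed just below:
	;  DstWidth*(SHEIGHT-1)-1 + (DstWidth-_width+1) = DstWidth*SHEIGHT-_width,
	;  so the sum does work out; the -1/+1 split simply lets the first add
	;  stop on the line's last byte before the second add steps to the
	;  start of the next row.)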
dec eax ; DstLineStep1 = DstWidth*(SHEIGHT-1)-1 mov DstLineStep1, eax mov eax, DstWidth ; DstLineStep2 = DstWidth-_width+1 sub eax, _width inc eax mov DstLineStep2, eax mov eax, DstLineStep1 ; LineEnd = DstWidth*(SHEIGHT-1)+_width-1 add eax, _width mov LineEnd, eax ; esi = buf (pointer into buf) ; ebx = pointer into ops ; dx = temp for current op. dl xor dh keeps just upper nibble op. ; edi = pointer into screen ; ecx = remaining square lines to copy .if prvbuf ; buf = prvbuf ? nf_buf_prv : nf_buf_cur mov esi, nf_buf_prv .else mov esi, nf_buf_cur .endif mov eax, y ; + y*SHEIGHT*nf_WIDTH shl eax, LOG2_SHEIGHT mul nf_width add esi, eax mov eax, x ; + x*SWIDTH*HI_COLOR_SCALE shl eax, LOG2_SWIDTH+HI_COLOR_FLAG add esi, eax ; dstx must be a multiple of 4 because everything is done on 32-bit words ; and bank crossing checks don't check for a crossing within a word. and dstx, NOT 3 ; dstx &= ~3 mov ebx, ops mov cl, nf_fqty nxtfld: push ecx push esi mov ecx, h push ebx mov eax, sf_LineWidth mul dsty add eax, dstx ; bank = start / WinGran ; dst = (start % WinGran) + sf_WriteWinPtr mov edx, 0 div sf_WinGran mov bank, eax mov edi, edx add edi, sf_WriteWinPtr ; Select new bank cmp sf_SetBank, 0 jz nobank mov bh, 0 mov bl, byte ptr sf_WriteWin mov edx, bank call sf_SetBank ; eax/edx destroyed by sf_SetBank nobank: pop ebx NextLine: push ecx mov eax, edi add eax, LineEnd ; (SHEIGHT-1)*DstWidth+_width-1 sub eax, sf_WriteWinLimit jb NoSplit jmp Split LineDone: pop ecx add esi, SrcLineStep ; Move back to start column, down SHEIGHT add edi, DstLineStep1 ; First advance to last byte add edi, DstLineStep2 ; Then advance to new start loop NextLine pop esi pop ecx add esi, nf_width inc dsty dec cl jnz nxtfld jmp Finished ; --- Copy full squares --- ; Scan over contiguous unchanged squares up to max per line ; For each unchanged square, add 8 (SWIDTH) to esi and edi. ; count # of contiguous changed squares up to max per line ; Init eax to 0, ebx and edx to line steps for source and dest. ; For each square, add 2 (SWIDTH/4) to eax and subtract 8 (SWIDTH) ; from ebx and edx. NoSplit: mov ecx, w shr ecx, 1 mov eax, 0 fNext1: mov dl, [ebx] inc ebx mov dh, dl and dh, 0Fh jnz fChgd2a fTest1a:add esi, SWIDTH*HI_COLOR_SCALE add edi, SWIDTH*HI_COLOR_SCALE xor dl, dh jnz fChgd2b fTest1b:add esi, SWIDTH*HI_COLOR_SCALE add edi, SWIDTH*HI_COLOR_SCALE dec ecx jnz fNext1 jmp LineDone fNext2: mov dl, [ebx] inc ebx mov dh, dl and dh, 0Fh jz fCopy3a fChgd2a:add eax, SWIDTH*HI_COLOR_SCALE/4 xor dl, dh jz fCopy3b fChgd2b:add eax, SWIDTH*HI_COLOR_SCALE/4 dec ecx jnz fNext2 call fCopy jmp LineDone fCopy3a:call fCopy xor eax, eax jmp fTest1a fCopy3b:call fCopy xor eax, eax jmp fTest1b fCopy: push ebx push ecx push edx mov ecx, eax shl ecx, 2 mov ebx, SrcWidth sub ebx, ecx mov edx, DstWidth sub edx, ecx REPEAT 7 mov ecx, eax rep movsd add esi, ebx add edi, edx ENDM mov ecx, eax rep movsd sub esi, SrcWidth7 sub edi, DstWidth7 pop edx pop ecx pop ebx retn ; --- Copy squares across bank boundary --- ; (occurs infrequently, but should be streamlined as much as possible ; because it could potentially be much more expensive than normal ; operation). ; HMMM... 16*640 = 10240 = approx 1/6 64K, so for 640x480, ; roughly 1 in 6 square lines will need special processing ; (actually, 2 in 12 due to interlacing, but that's the same ratio). ; Repeat above twice, once for end of cur bank, once for start ; of next bank, with following modifications: ; For end of cur bank... 
; if di>=WinSize, we're done with squares (but need to correctly ; adjust si and di!) ; if di+4*cx>WinSize, we need to clip and then we're done ; (but need to correctly adjust si and di!) ; Reduce cx to (WinSize-di)/4. ; limit=WriteWinLimit-4*ax Split: push ebx push esi push edi mov ecx, w shr ecx, 1 mov eax, 0 aNext1: mov dl, [ebx] inc ebx mov dh, dl and dh, 0Fh jnz aChgd2a aTest1a:add esi, SWIDTH*HI_COLOR_SCALE add edi, SWIDTH*HI_COLOR_SCALE xor dl, dh jnz aChgd2b aTest1b:add esi, SWIDTH*HI_COLOR_SCALE add edi, SWIDTH*HI_COLOR_SCALE dec ecx jnz aNext1 jmp LineDone aNext2: mov dl, [ebx] inc ebx mov dh, dl and dh, 0Fh jz aCopy3a aChgd2a:add eax, SWIDTH*HI_COLOR_SCALE/4 xor dl, dh jz aCopy3b aChgd2b:add eax, SWIDTH*HI_COLOR_SCALE/4 dec ecx jnz aNext2 call aCopy jmp LineDone aCopy3a:call aCopy xor eax, eax jmp aTest1a aCopy3b:call aCopy xor eax, eax jmp aTest1b aCopy: push ebx push ecx push edx push esi push edi mov ecx, eax shl ecx, 2 mov ebx, SrcWidth sub ebx, ecx mov edx, DstWidth sub edx, ecx sub ecx, sf_WriteWinLimit neg ecx mov limit, ecx REPEAT 7 cmp edi, limit jns aFinal mov ecx, eax rep movsd add esi, ebx add edi, edx ENDM cmp edi, limit jns aFinal mov ecx, eax jmp aLast aFinal: mov ecx, sf_WriteWinLimit sub ecx, edi js aCpyDn shr ecx, 2 aLast: rep movsd aCpyDn: pop edi pop esi mov ecx, eax shl ecx, 2 add esi, ecx add edi, ecx pop edx pop ecx pop ebx retn aDone: pop edi pop esi ; Advance bank mov eax, sf_WinGranPerSize add bank, eax sub edi, sf_WinSize ; Select new bank cmp sf_SetBank, 0 jz nobank2 mov bh, 0 mov bl, byte ptr sf_WriteWin mov edx, bank call sf_SetBank ; eax/edx destroyed by SetBank nobank2: pop ebx ; For start of next bank... ; While di+4*cx<=0, advance si & di by src/dst line step instead of ; doing rep mov ; If di<0, cx += di/4, si-=di, di=0 ; Do remaining rep mov's (first with modified args, remainder with ; with full args). ; Init bx/dx to src/dst line steps. 
; For start of next bank...
;   While di+4*cx<=0, advance si & di by src/dst line step instead of
;    doing rep mov
;   If di<0, cx += di/4, si-=di, di=0
;   Do remaining rep mov's (first with modified args, remainder with
;    full args).
;   Init bx/dx to src/dst line steps.
;   limit=-4*ax
        mov     ecx, w
        shr     ecx, 1
        mov     eax, 0
bNext1: mov     dl, [ebx]
        inc     ebx
        mov     dh, dl
        and     dh, 0Fh
        jnz     bChgd2a
bTest1a:add     esi, SWIDTH*HI_COLOR_SCALE
        add     edi, SWIDTH*HI_COLOR_SCALE
        xor     dl, dh
        jnz     bChgd2b
bTest1b:add     esi, SWIDTH*HI_COLOR_SCALE
        add     edi, SWIDTH*HI_COLOR_SCALE
        dec     ecx
        jnz     bNext1
        jmp     LineDone

bNext2: mov     dl, [ebx]
        inc     ebx
        mov     dh, dl
        and     dh, 0Fh
        jz      bCopy3a
bChgd2a:add     eax, SWIDTH*HI_COLOR_SCALE/4
        xor     dl, dh
        jz      bCopy3b
bChgd2b:add     eax, SWIDTH*HI_COLOR_SCALE/4
        dec     ecx
        jnz     bNext2
        call    bCopy
        jmp     LineDone

bCopy3a:call    bCopy
        xor     eax, eax
        jmp     bTest1a
bCopy3b:call    bCopy
        xor     eax, eax
        jmp     bTest1b

bCopy:  push    ebx
        push    ecx
        push    edx
        mov     ecx, eax
        shl     ecx, 2
        neg     ecx
        mov     limit, ecx
        mov     ebx, SrcWidth
        mov     edx, DstWidth
        sub     edi, sf_WriteWinPtr
        FOR bMovN, <bMov1,bMov2,bMov3,bMov4,bMov5,bMov6,bMov7>
        mov     ecx, offset bMovN
        jns     bFull
        cmp     limit, edi
        js      bPart
        add     esi, ebx
        add     edi, edx
        ENDM
        mov     ecx, offset bMov8
        jns     bFull
        cmp     limit, edi
        js      bPart
        add     edi, sf_WriteWinPtr
        shl     eax, 2
        add     esi, eax
        add     edi, eax
        jmp     bCpyDn

bFull:  push    ecx
        mov     ecx, eax
        add     ebx, limit
        add     edx, limit
        add     edi, sf_WriteWinPtr
        retn

bPart:  push    ecx
        mov     ecx, eax
        sub     esi, edi
        sar     edi, 2
        add     ecx, edi
        mov     edi, sf_WriteWinPtr
        add     ebx, limit
        add     edx, limit
        retn

        FOR bMovN, <bMov1,bMov2,bMov3,bMov4,bMov5,bMov6,bMov7>
bMovN:  rep movsd
        mov     ecx, eax
        add     esi, ebx
        add     edi, edx
        ENDM
bMov8:  rep movsd
bCpyDn: sub     esi, SrcWidth7
        sub     edi, DstWidth7
        pop     edx
        pop     ecx
        pop     ebx
        retn

Finished:
        ret
        ENDM ; PK_SHOW_FRAME_CHG_BODY

;void
;mve_sfPkShowFrameChg(
;  bool prvbuf,
;  unsigned x, unsigned y, unsigned w, unsigned h,
;  unsigned char *ops,
;  unsigned dstx, unsigned dsty)
;
mve_sfPkShowFrameChg PROC USES ESI EDI EBX, \
        prvbuf:DWORD, \
        x:DWORD, y:DWORD, w:DWORD, h:DWORD, \
        ops:PTRBYTE, \
        dstx:DWORD, dsty:DWORD
        LOCAL _width:DWORD
        LOCAL SrcWidth:DWORD
        LOCAL DstWidth:DWORD
        LOCAL SrcWidth7:DWORD
        LOCAL DstWidth7:DWORD
        LOCAL SrcLineStep:DWORD
        LOCAL DstLineStep1:DWORD
        LOCAL DstLineStep2:DWORD
        LOCAL LineEnd:DWORD
        LOCAL bank:DWORD
        LOCAL limit:DWORD
        PK_SHOW_FRAME_CHG_BODY 0        ; Not HiColor
mve_sfPkShowFrameChg ENDP

if HICOLOR
;void
;mve_sfPkHiColorShowFrameChg(
;  bool prvbuf,
;  unsigned x, unsigned y, unsigned w, unsigned h,
;  unsigned char *ops,
;  unsigned dstx, unsigned dsty)
;
mve_sfPkHiColorShowFrameChg PROC USES ESI EDI EBX, \
        prvbuf:DWORD, \
        x:DWORD, y:DWORD, w:DWORD, h:DWORD, \
        ops:PTRBYTE, \
        dstx:DWORD, dsty:DWORD
        LOCAL _width:DWORD
        LOCAL SrcWidth:DWORD
        LOCAL DstWidth:DWORD
        LOCAL SrcWidth7:DWORD
        LOCAL DstWidth7:DWORD
        LOCAL SrcLineStep:DWORD
        LOCAL DstLineStep1:DWORD
        LOCAL DstLineStep2:DWORD
        LOCAL LineEnd:DWORD
        LOCAL bank:DWORD
        LOCAL limit:DWORD
        PK_SHOW_FRAME_CHG_BODY 1        ; HiColor
mve_sfPkHiColorShowFrameChg ENDP
endif ;HICOLOR
endif ;PKDATA
endif

;---------------------------------------------------------------------
; Palette Management
;---------------------

;void __cdecl
;MVE_SetPalette(unsigned char *p, unsigned start, unsigned count)
;
MVE_SetPalette PROC USES ESI EBX, \
        p:PTRBYTE, start:DWORD, count:DWORD
        mov     eax, start
        mov     ecx, count
        mov     esi, p
        .if eax>=256                    ; if (start>=256) return;
        ret
        .endif
        lea     ebx, [eax+ecx]          ; if (start+count>256)
        .if ebx>256
        mov     ecx, 256                ;   count = 256-start
        sub     ecx, eax
        .endif
        add     esi, eax                ; p += start*3
        add     esi, eax
        add     esi, eax
        lea     ecx, [ecx+2*ecx]        ; count *= 3
        mov     edx, 03c8h              ; DAC Write Index Register
        out     dx, al                  ; Init write index to start
        inc     edx                     ; DAC Data Register
        rep outsb
        ret
MVE_SetPalette ENDP
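; For reference, a rough C equivalent of MVE_SetPalette (illustrative only,
; not part of the build; outportb() stands for whatever byte port-output
; primitive the surrounding C code uses):
;
;   void MVE_SetPalette(unsigned char *p, unsigned start, unsigned count)
;   {
;       if (start >= 256)
;           return;
;       if (start + count > 256)
;           count = 256 - start;
;       p += 3*start;
;       outportb(0x3C8, (unsigned char)start);   /* DAC Write Index Register */
;       for (unsigned i = 0; i < 3*count; ++i)
;           outportb(0x3C9, p[i]);               /* DAC Data Register */
;   }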
; If at least 11 palette entries aren't changed, this is more compact
; than uncompressed 256 entry palette.
;
;static void palLoadCompPalette(unsigned char *buf)
;
palLoadCompPalette PROC USES ESI EDI, \
        buf: PTRBYTE
        mov     ax, ds                  ; Insure es==ds for symantec flat mode
        mov     es, ax
        mov     cx, 32
        mov     esi, buf
        mov     edi, offset pal_tbl
next:   lodsb
        or      al, al
        jnz     chk0
        add     edi, 24
        loop    next
        jmp     done
chk0:   test    al, 1
        jz      not0
        movsw
        movsb
        test    al, 2
        jz      not1
cpy1:   movsw
        movsb
        test    al, 4
        jz      not2
cpy2:   movsw
        movsb
        test    al, 8
        jz      not3
cpy3:   movsw
        movsb
        test    al, 16
        jz      not4
cpy4:   movsw
        movsb
        test    al, 32
        jz      not5
cpy5:   movsw
        movsb
        test    al, 64
        jz      not6
cpy6:   movsw
        movsb
        or      al, al
        jns     not7
cpy7:   movsw
        movsb
        loop    next
        jmp     done
not0:   add     edi, 3
        test    al, 2
        jnz     cpy1
not1:   add     edi, 3
        test    al, 4
        jnz     cpy2
not2:   add     edi, 3
        test    al, 8
        jnz     cpy3
not3:   add     edi, 3
        test    al, 16
        jnz     cpy4
not4:   add     edi, 3
        test    al, 32
        jnz     cpy5
not5:   add     edi, 3
        test    al, 64
        jnz     cpy6
not6:   add     edi, 3
        or      al, al
        js      cpy7
not7:   add     edi, 3
        loop    next
done:   ret
palLoadCompPalette ENDP
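; For reference, the stream format consumed above amounts to the following
; rough C sketch (illustrative only, not part of the build): 32 mask bytes,
; each immediately followed by an R,G,B triple for every bit that is set
; (bit 0 = first entry of that group of 8); entries whose bit is clear keep
; their current pal_tbl value.
;
;   #include <string.h>
;
;   extern unsigned char pal_tbl[3*256];
;
;   static void palLoadCompPalette_c(const unsigned char *buf)
;   {
;       unsigned char *dst = pal_tbl;
;       for (unsigned group = 0; group < 32; ++group) {
;           unsigned mask = *buf++;
;           for (unsigned bit = 0; bit < 8; ++bit, dst += 3) {
;               if (mask & (1u << bit)) {
;                   memcpy(dst, buf, 3);   /* copy one R,G,B entry */
;                   buf += 3;
;               }
;           }
;       }
;   }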
;-----------------------------------------------------------------------
; Graphics
;----------

gfxMode proc USES EBP ESI EDI EBX, mode:DWORD
        mov     eax, mode
        int     10h
        ret
gfxMode endp

gfxLoadCrtc proc USES ESI EDI EBX, crtc:PTRBYTE, chain4:BYTE, res:BYTE
        mov     edx, 03c4h              ; alter sequence registers
        mov     al, 04h                 ; disable or enable chain 4 in memory mode
        mov     ah, chain4
        out     dx, ax

        mov     dx, 03dah               ; General Input State #1 register
l1:     in      al, dx                  ; Loop until vertical retrace is off
        test    al, 8
        jnz     l1
l2:     in      al, dx                  ; Now loop until it's back on
        test    al, 8
        jz      l2

        cli                             ; turn off all interrupts
        mov     edx, 03c4h              ; Sequencer Synchronous reset
        mov     eax, 0100h              ; Set sequencer reset
        out     dx, ax
        mov     edx, 03c2h              ; Misc Output Register
        mov     al, res                 ; 25/28-MHz, 350/400/480 lines
        out     dx, al
        mov     edx, 03c4h              ; Sequencer Synchronous reset
        mov     eax, 0300h              ; Clear sequencer reset
        out     dx, ax
        mov     edx, 03d4h              ; 6845 CRTC
        mov     esi, crtc               ; tweaked values for CRTC registers
        mov     al, 011h                ; deprotect CRTC registers 0-7
        mov     ah, [esi+011h]
        and     ah, 07Fh
        out     dx, ax
        mov     ecx, 018h               ; Update CRTC registers with tweaked values
        mov     ebx, 0
l3:     mov     al, bl
        mov     ah, [esi+ebx]
        out     dx, ax
        inc     bl
        loop    l3
        sti                             ; restore interrupts
        ret
gfxLoadCrtc endp

; void __cdecl gfxGetCrtc(unsigned char *crtc);
;
gfxGetCrtc proc USES ESI EBX, crtc:PTRBYTE
        mov     edx, 03d4h              ; 6845 CRTC
        mov     esi, crtc
        mov     ecx, 018h
        mov     ebx, 0
l3:     mov     al, bl
        out     dx, al
        inc     dx
        in      al, dx
        dec     dx
        mov     [esi+ebx], al
        inc     bl
        loop    l3
        ret
gfxGetCrtc endp

; void __cdecl gfxVres(unsigned char misc, unsigned char *crtc);
; misc is one of the following:
;  350: 0x23 | 0x80 (2)
;  400: 0x23 | 0x40 (1)
;  480: 0x23 | 0xc0 (3)

; Get crtc register specified by crtc_addr into ah.
; To update register, do out dx,ax
GetCrtc MACRO crtc_addr
        mov     al, crtc_addr
        out     dx, al
        inc     dx
        in      al, dx
        dec     dx
        mov     ah, al
        mov     al, crtc_addr
        ENDM

gfxVres PROC USES EBX, misc:BYTE, crtc:PTRBYTE
        mov     edx, 03dah              ; General Input State #1 register
l1:     in      al, dx                  ; Loop until vertical retrace is off
        test    al, 8
        jnz     l1
l2:     in      al, dx                  ; Now loop until it's back on
        test    al, 8
        jz      l2

        cli                             ; turn off all interrupts
        mov     edx, 03c4h              ; Sequencer Synchronous reset
        mov     eax, 0100h              ; Set sequencer reset
        out     dx, ax
        mov     edx, 03cch              ; Misc Output Register (read port)
        in      al, dx
        and     al, 03fh                ; Keep all but lines field
        mov     edx, 03c2h              ; Misc Output Register (write port)
        and     misc, 0c0h              ; Only keep lines field
        or      al, misc                ; 350/400/480 lines
        out     dx, al
        mov     edx, 03c4h              ; Sequencer Synchronous reset
        mov     eax, 0300h              ; Clear sequencer reset
        out     dx, ax

        mov     edx, 03d4h              ; CRTC address port
        mov     ebx, crtc               ; Desired CRTC image

        GetCrtc 011h                    ; Vertical Retrace End register
        and     ah, 07Fh                ; Deprotect CRTC registers 0-7
        out     dx, ax

        GetCrtc 03h                     ; End Horizontal Blanking register
        or      ah, 080h                ; Enable CRTC registers 10-11
        out     dx, ax

        mov     al, 06h                 ; Vertical Total register
        mov     ah, byte ptr 06h[ebx]
        out     dx, ax

        GetCrtc 07h                     ; Overflow register
        and     ah, 010h                ; (Preserve LC)
        or      ah, byte ptr 07h[ebx]
        out     dx, ax

        GetCrtc 09h                     ; Maximum Scan Line register
        and     ah, 040h                ; (Preserve LC)
        or      ah, byte ptr 09h[ebx]
        out     dx, ax

        mov     al, 010h                ; Vertical Retrace Start register
        mov     ah, byte ptr 010h[ebx]
        out     dx, ax

        GetCrtc 11h                     ; Vertical Retrace End register
        and     ah, 070h                ; (Preserve BW,DVI,CVI)
        or      ah, byte ptr 011h[ebx]
        or      ah, 080h                ; Reprotect 0-7
        out     dx, ax

        mov     al, 012h                ; Vertical Display End register
        mov     ah, byte ptr 012h[ebx]
        out     dx, ax

        mov     al, 015h                ; Start Vertical Blank register
        mov     ah, byte ptr 015h[ebx]
        out     dx, ax

; Some SVGAs use 7-bit vbe, others 8-bit vbe!
if 0
        GetCrtc 16h                     ; End Vertical Blank register
        and     ah, 080h                ; (Preserve reserved field)
        or      ah, byte ptr 016h[ebx]
        out     dx, ax
else
        mov     al, 16h                 ; End Vertical Blank register
        mov     ah, byte ptr 016h[ebx]
        out     dx, ax
endif

        sti                             ; restore interrupts
        ret
gfxVres ENDP

; void __cdecl MVE_gfxWaitRetrace(unsigned state);
;
MVE_gfxWaitRetrace proc state:DWORD
        mov     edx, 03dah              ; Input Status #1 register
        mov     eax, state
        or      eax, eax
        jnz     wt1
wt0:    in      al, dx                  ; Wait for retrace off
        and     al, 8
        jnz     wt0
        ret
wt1:    in      al, dx                  ; Wait for retrace on
        and     al, 8
        jz      wt1
        ret
MVE_gfxWaitRetrace endp

; void __cdecl MVE_gfxSetSplit(unsigned line)
;
MVE_gfxSetSplit proc line:DWORD
        mov     edx, 03dah              ; Input State #1 register
wt0:    in      al, dx                  ; Wait for retrace off
        and     al, 8
        jnz     wt0
wt1:    in      al, dx                  ; Wait for retrace on
        and     al, 8
        jz      wt1

        mov     edx, 03d4h              ; CRTC address port

        mov     ecx, line
        shr     ecx, 4
        and     cl, 010h
        GetCrtc 07h                     ; Overflow Register
        and     ah, 0EFh                ; LC8 (mask=10h)
        or      ah, cl
        out     dx, ax

        mov     ecx, line
        shr     ecx, 3
        and     cl, 040h
        GetCrtc 09h                     ; Maximum Scan Line Register
        and     ah, 0BFh                ; LC9 (mask=40h)
        or      ah, cl
        out     dx, ax

        mov     al, 18h                 ; Line Compare Register
        mov     ah, byte ptr line
        out     dx, ax
        ret
MVE_gfxSetSplit endp

;----------------------------------------------------------------------

mveliba_end:

        END