/*
 * yuv_convert.cc --
 *
 *      Defines generic yuv_conversion routines
 *      
 *		destWidth and destHeight specify image dimensions for dest buffer
 *		srcWidth and srcHeight specify image dimensions for src buffer
 *		dimensions are specified in terms of size of video image
 *      --horizontal dimensions should be a multiple of 16, although 8
 *        is acceptable for certain video formats
 *
 * Copyright (c) 2001 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * A. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * B. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * C. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include"yuv_convert.h"
#include<string.h>
#include<stdio.h>
#include"cpuinfo.h"

// #define VERIFY_MMX // causes status messages to be printed

//#define MMX_ENABLED // enables use of MMX code
// NOTE(review): EMMS_FLAG is not referenced in the part of the file reviewed;
// presumably it is passed as the issueEmms argument of the MMX helpers -- confirm.
#define EMMS_FLAG false // true means use emms quite often

#ifdef X86_ASSEMBLER
//if MMX used, then these functions can be used to align memory to 64-bit boundary
//
// align64(c): if the low three bits of the address c are non-zero, yields the
// next higher multiple of 8 cast to (long *); otherwise yields c unchanged
// (with c's own type).  The address is handled as an unsigned long.
// NOTE(review): both branches below define the identical macro; only the
// i64 typedef is specific to the non-WIN32 build (the WIN32 code uses __int64).
#ifndef WIN32
#define align64(c) ((((unsigned long)(c))&0x7)?(long *)((((((unsigned long)(c))>>3)+1)<<3)):(c))
typedef long long unsigned int i64; // 64-bit unsigned type used for the MMX mask constant
#else
#define align64(c) ((((unsigned long)(c))&0x7)?(long *)((((((unsigned long)(c))>>3)+1)<<3)):(c))
#endif
#endif

//
//	planarYUYV422_to_planarYUYV422
//
//  Copies a planar 4:2:2 frame (a w*h Y plane followed by (w/2)*h U and V
//  planes) from src to dest.  When both frames have the same size this is a
//  single memcpy of 2*w*h bytes.  Otherwise the source is (approximately)
//  centred in the destination: a smaller source is padded and a larger one
//  is clipped, independently in each direction.
//
//  Padding bytes in dest are skipped, never written, so the caller has to
//  pre-fill dest if the border needs a defined value.
//
//  The left pad/clip is rounded down to an even number of pixels so that it
//  corresponds to a whole number of chroma samples.  With an odd offset the
//  chroma loops used to advance one byte less per row than the chroma
//  stride, which skewed the U and V planes.
//
//  Returns false (after printing a message) when either width is odd,
//  true otherwise.
//
bool planarYUYV422_to_planarYUYV422(char* dest, int destWidth, int destHeight,
									char* src, int srcWidth, int srcHeight) {
#ifdef VERIFY_MMX
	printf("non-MMX: planarYUYV422_to_planarYUYV422 (no MMX exists)\n");
#endif
	if( (destWidth&0x1) || (srcWidth&0x1) ) {
		printf("wrong width in planarYUYV422_to_planarYUYV422\n");
		return false;
	}
	if(destHeight==srcHeight && destWidth==srcWidth) {
		// sizes all the same, so can just copy data (y + u + v = 2*w*h bytes)
		memcpy(dest, src, (size_t)((destHeight*destWidth)<<1));
		return true;
	}

	// total columns/rows to add to (pad) or drop from (clip) the source
	int padWidth=(srcWidth<destWidth)?(destWidth-srcWidth):0;
	int clipWidth=(srcWidth>destWidth)?(srcWidth-destWidth):0;
	int padHeight=(srcHeight<destHeight)?(destHeight-srcHeight):0;
	int clipHeight=(srcHeight>destHeight)?(srcHeight-destHeight):0;

	// split them between the two sides; the left part is kept even so that
	// (x>>1) below is exact for the chroma planes
	int leftPad=((padWidth>>1)&~0x1);
	int rightPad=padWidth-leftPad;
	int leftClip=((clipWidth>>1)&~0x1);
	int rightClip=clipWidth-leftClip;

	int upPad=(padHeight>>1);
	int downPad=padHeight-upPad;
	int upClip=(clipHeight>>1);
	int downClip=clipHeight-upClip;

	// size of the region actually copied, in luma pixels
	int rows=(srcHeight>destHeight)?destHeight:srcHeight;
	int cols=(srcWidth>destWidth)?destWidth:srcWidth;
	int i;

	// copy the y data: skip the top pad/clip, copy the rows, skip the bottom
	dest+=(destWidth*upPad);
	src+=(srcWidth*upClip);
	for(i=0; i<rows; ++i) {
		dest+=leftPad;
		src+=leftClip;
		memcpy(dest, src, cols);
		dest+=cols+rightPad;
		src+=cols+rightClip;
	}
	dest+=(destWidth*downPad);
	src+=(srcWidth*downClip);

	// copy the u plane and then the v plane; both have half the luma width
	// and the full height, so every horizontal quantity is halved
	for(int plane=0; plane<2; ++plane) {
		dest+=((destWidth*upPad)>>1);
		src+=((srcWidth*upClip)>>1);
		for(i=0; i<rows; ++i) {
			dest+=(leftPad>>1);
			src+=(leftClip>>1);
			memcpy(dest, src, (cols>>1));
			dest+=((cols+rightPad)>>1);
			src+=((cols+rightClip)>>1);
		}
		dest+=((destWidth*downPad)>>1);
		src+=((srcWidth*downClip)>>1);
	}
	return true;
}

//
//	planarYUYV422_to_planarYUYV411
//
// This function downsamples a planar-422 frame to a planar-411 one: the Y
// plane is copied and every second sample of each U and V row is kept, so
// the destination chroma planes are (w/4)*h bytes each.
//
// When the sizes differ the source is (approximately) centred in the
// destination, padding a smaller source and clipping a larger one.
// Padding bytes in dest are skipped, never written.
//
// The left pad is rounded down to a multiple of 4 and the left clip to an
// even number of pixels so that both map onto whole chroma samples.  With
// an odd left clip the source chroma rows used to advance one byte too
// little per row, so the V plane was read from inside the U plane.
//
// Returns false (after printing a message) when destWidth is not a multiple
// of 4, srcWidth is odd, or the right padding is not a multiple of 4;
// true otherwise.
bool planarYUYV422_to_planarYUYV411(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight) {
#ifdef VERIFY_MMX
	printf("non-MMX: planarYUYV422_to_planarYUYV411 (no MMX exists)\n");
#endif

	if( (destWidth&0x3) || (srcWidth&0x1) ) {
		printf("wrong width in planarYUYV422_to_planarYUYV411\n");
		return false;
	}

	int i;

	if(destWidth==srcWidth && destHeight==srcHeight) {
		int lumaBytes=destHeight*destWidth;
		char *srcu=src+lumaBytes;
		char *srcv=srcu+(lumaBytes>>1);
		char *dstu=dest+lumaBytes;
		char *dstv=dstu+(lumaBytes>>2);

		// copy the y data
		memcpy(dest, src, (size_t)destHeight*destWidth);

		// copy the u and v, downsampling by 2
		for(i=0; i<(lumaBytes>>2); ++i) {
			dstu[i]=srcu[i<<1];
			dstv[i]=srcv[i<<1];
		}
		return true;
	}

	// total columns/rows to add to (pad) or drop from (clip) the source
	int padWidth=(srcWidth<destWidth)?(destWidth-srcWidth):0;
	int clipWidth=(srcWidth>destWidth)?(srcWidth-destWidth):0;
	int padHeight=(srcHeight<destHeight)?(destHeight-srcHeight):0;
	int clipHeight=(srcHeight>destHeight)?(srcHeight-destHeight):0;

	int leftPad=(((padWidth>>1)>>2)<<2);	// whole 4:1:1 chroma samples
	int rightPad=padWidth-leftPad;
	int leftClip=((clipWidth>>1)&~0x1);		// whole 4:2:2 chroma samples
	int rightClip=clipWidth-leftClip;

	int upPad=(padHeight>>1);
	int downPad=padHeight-upPad;
	int upClip=(clipHeight>>1);
	int downClip=clipHeight-upClip;

	if( (leftPad&0x3) || (rightPad&0x3) ) {
		printf("horz padding must be multiple of 4 in planarYUYV422_to_planarYUYV411\n");
		return false;
	}

	// size of the region actually copied, in luma pixels
	int rows=(srcHeight>destHeight)?destHeight:srcHeight;
	int cols=(srcWidth>destWidth)?destWidth:srcWidth;

	// copy the y data: skip the top pad/clip, copy the rows, skip the bottom
	dest+=(destWidth*upPad);
	src+=(srcWidth*upClip);
	for(i=0; i<rows; ++i) {
		dest+=leftPad;
		src+=leftClip;
		memcpy(dest, src, cols);
		dest+=cols+rightPad;
		src+=cols+rightClip;
	}
	dest+=(destWidth*downPad);
	src+=(srcWidth*downClip);

	// copy the u plane and then the v plane; a source row holds w/2 samples,
	// a destination row w/4, so every other source sample is taken
	for(int plane=0; plane<2; ++plane) {
		dest+=((destWidth*upPad)>>2);
		src+=((srcWidth*upClip)>>1);
		for(i=0; i<rows; ++i) {
			dest+=(leftPad>>2);
			src+=(leftClip>>1);
			for(int j=0; j<(cols>>2); ++j) {
				*dest=*src;
				dest++;
				src+=2;
			}
			dest+=(rightPad>>2);
			src+=(rightClip>>1);
		}
		dest+=((destWidth*downPad)>>2);
		src+=((srcWidth*downClip)>>1);
	}
	return true;
}

//
//	planarYUYV422_to_planarYUYV420
//
// This function downsamples a planar-422 frame to a planar-420 one: the Y
// plane is copied and only every second row of U and V is kept (the chroma
// of the odd rows is dropped), so the destination chroma planes are
// (w/2)*(h/2) bytes each.
//
// When the sizes differ the source is (approximately) centred in the
// destination, padding a smaller source and clipping a larger one.
// Padding bytes in dest are skipped, never written.
//
// The top pad is rounded down to an even number of rows (whole 4:2:0 chroma
// rows) and the left pad/clip to an even number of pixels (whole chroma
// samples).  With an odd left offset the chroma rows used to advance one
// byte too little, which shifted the V plane.
//
// Returns false (after printing a message) when a width or a height is odd,
// true otherwise.
bool planarYUYV422_to_planarYUYV420(char* dest, int destWidth, int destHeight,
									char* src, int srcWidth, int srcHeight)
{
#ifdef VERIFY_MMX
	printf("non-MMX: planarYUYV422_to_planarYUYV420 (no MMX exists)\n");
#endif
	
	if( (destWidth&0x1) || (srcWidth&0x1) ) {
		printf("wrong width in planarYUYV422_to_planarYUYV420\n");
		return false;
	}
	if( (destHeight&0x1) || (srcHeight&0x1) ) {
		printf("even height required in planarYUYV422_to_planarYUYV420\n");
		return false;
	}

	int i;

	if(destWidth==srcWidth && destHeight==srcHeight) {
		int lumaBytes=destHeight*destWidth;
		int chromaRow=(destWidth>>1);
		char *srcu=src+lumaBytes;
		char *srcv=srcu+(lumaBytes>>1);
		char *dstu=dest+lumaBytes;
		char *dstv=dstu+(lumaBytes>>2);

		// copy the y
		memcpy(dest, src, (size_t)destHeight*destWidth);
		// copy the u and v, keeping the even rows only
		for(i=0; i<(destHeight>>1); ++i) {
			memcpy(dstu, srcu, chromaRow);
			memcpy(dstv, srcv, chromaRow);
			dstu+=chromaRow;
			dstv+=chromaRow;
			srcu+=destWidth;	// two source chroma rows
			srcv+=destWidth;
		}
		return true;
	}

	// total columns/rows to add to (pad) or drop from (clip) the source
	int padWidth=(srcWidth<destWidth)?(destWidth-srcWidth):0;
	int clipWidth=(srcWidth>destWidth)?(srcWidth-destWidth):0;
	int padHeight=(srcHeight<destHeight)?(destHeight-srcHeight):0;
	int clipHeight=(srcHeight>destHeight)?(srcHeight-destHeight):0;

	int leftPad=((padWidth>>1)&~0x1);
	int rightPad=padWidth-leftPad;
	int leftClip=((clipWidth>>1)&~0x1);
	int rightClip=clipWidth-leftClip;

	int upPad=((padHeight>>1)&~0x1);	// whole destination chroma rows
	int downPad=padHeight-upPad;
	int upClip=(clipHeight>>1);
	int downClip=clipHeight-upClip;

	// size of the region actually copied, in luma pixels
	int rows=(srcHeight>destHeight)?destHeight:srcHeight;
	int cols=(srcWidth>destWidth)?destWidth:srcWidth;
	int halfCols=(cols>>1);

	// copy the y data: skip the top pad/clip, copy the rows, skip the bottom
	dest+=(destWidth*upPad);
	src+=(srcWidth*upClip);
	for(i=0; i<rows; ++i) {
		dest+=leftPad;
		src+=leftClip;
		memcpy(dest, src, cols);
		dest+=cols+rightPad;
		src+=cols+rightClip;
	}
	dest+=(destWidth*downPad);
	src+=(srcWidth*downClip);

	// copy the u plane and then the v plane: one destination chroma row is
	// produced for every two source rows
	for(int plane=0; plane<2; ++plane) {
		dest+=((destWidth*upPad)>>2);
		src+=((srcWidth*upClip)>>1);
		for(i=0; i<(rows>>1); i++) {
			dest+=(leftPad>>1);
			src+=(leftClip>>1);
			memcpy(dest, src, halfCols);
			dest+=halfCols+(rightPad>>1);
			src+=halfCols+(rightClip>>1);
			src+=(srcWidth>>1); // skip a row
		}
		dest+=((destWidth*downPad)>>2);
		src+=((srcWidth*downClip)>>1);
	}
	return true;
}

/////////////////////////////////////////////////////////////////
//
//	planarYUYV420_to_planarYUYV422
//
//  This function upsamples a planar-420 frame to a planar-422 one: the Y
//  plane is copied and every U and V row is written twice, so the
//  destination chroma planes are (w/2)*h bytes each.
//
//  When the sizes differ the source is (approximately) centred in the
//  destination, padding a smaller source and clipping a larger one.
//  Padding bytes in dest are skipped, never written.
//
//  Fixes relative to the earlier version:
//   - the same-size path now expands the V plane as well as the U plane
//     (it used to stop after U, leaving the V plane of dest untouched);
//   - the top clip is rounded down to an even number of rows, because an
//     odd value stepped half a chroma row into the 4:2:0 source;
//   - the left pad/clip is rounded down to an even number of pixels so the
//     chroma rows keep their real stride.
//
//  Returns false (after printing a message) when a width or a height is
//  odd, true otherwise.
//
bool planarYUYV420_to_planarYUYV422(char* dest, int destWidth, int destHeight,
									char* src, int srcWidth, int srcHeight) {
#ifdef VERIFY_MMX
	printf("non-MMX: planarYUYV420_to_planarYUYV422 (no MMX exists)\n");
#endif
	if( (destWidth&0x1) || (srcWidth&0x1) ) {
		printf("wrong width in planarYUYV420_to_planarYUYV422\n");
		return false;
	}	
	if( (destHeight&0x1) || (srcHeight&0x1) ) {
		printf("wrong height in planarYUYV420_to_planarYUYV422\n");
		return false;
	}	

	int i;

	if(destHeight==srcHeight && destWidth==srcWidth) {
		int chromaRow=(destWidth>>1);

		memcpy(dest, src, (size_t)(destHeight*destWidth)); // copy y
		dest+=destHeight*destWidth;
		src+=destHeight*destWidth;

		// The source holds destHeight/2 rows of u directly followed by
		// destHeight/2 rows of v, and the doubled rows are contiguous in
		// dest too, so one pass over destHeight source rows does both planes.
		for(i=0; i<destHeight; ++i) {
			memcpy(dest, src, (size_t)chromaRow); // 1st copy to dest
			dest+=chromaRow;
			memcpy(dest, src, (size_t)chromaRow); // 2nd copy to dest
			dest+=chromaRow;
			src+=chromaRow;
		}
		return true;
	}

	// total columns/rows to add to (pad) or drop from (clip) the source
	int padWidth=(srcWidth<destWidth)?(destWidth-srcWidth):0;
	int clipWidth=(srcWidth>destWidth)?(srcWidth-destWidth):0;
	int padHeight=(srcHeight<destHeight)?(destHeight-srcHeight):0;
	int clipHeight=(srcHeight>destHeight)?(srcHeight-destHeight):0;

	int leftPad=((padWidth>>1)&~0x1);
	int rightPad=padWidth-leftPad;
	int leftClip=((clipWidth>>1)&~0x1);
	int rightClip=clipWidth-leftClip;

	int upPad=(padHeight>>1);
	int downPad=padHeight-upPad;
	int upClip=((clipHeight>>1)&~0x1);	// whole source chroma rows
	int downClip=clipHeight-upClip;

	// size of the region actually copied, in luma pixels
	int rows=(srcHeight>destHeight)?destHeight:srcHeight;
	int cols=(srcWidth>destWidth)?destWidth:srcWidth;

	// copy the y data: skip the top pad/clip, copy the rows, skip the bottom
	dest+=(destWidth*upPad);
	src+=(srcWidth*upClip);
	for(i=0; i<rows; ++i) {
		dest+=leftPad;
		src+=leftClip;
		memcpy(dest, src, cols);
		dest+=cols+rightPad;
		src+=cols+rightClip;
	}
	dest+=(destWidth*downPad);
	src+=(srcWidth*downClip);

	// copy the u plane and then the v plane
	for(int plane=0; plane<2; ++plane) {
		dest+=((destWidth*upPad)>>1);
		src+=((srcWidth*upClip)>>2);
		for(i=0; i<rows; i+=2) {
			// source information is every-other row, so double each line
			src+=(leftClip>>1);

			dest+=(leftPad>>1);
			memcpy(dest, src, (cols>>1));
			dest+=((cols+rightPad)>>1);

			dest+=(leftPad>>1);
			memcpy(dest, src, (cols>>1));
			dest+=((cols+rightPad)>>1);

			src+=((cols+rightClip)>>1);
		}
		dest+=((destWidth*downPad)>>1);
		src+=((srcWidth*downClip)>>2);
	}
	return true;
}

//
//	planarYUYV420_to_planarYUYV411
//
// This function converts a planar-420 frame to a planar-411 one: the Y
// plane is copied; each U and V row is halved horizontally (every second
// sample is kept) and written to two consecutive destination rows, so the
// destination chroma planes are (w/4)*h bytes each.
//
// When the sizes differ the source is (approximately) centred in the
// destination, padding a smaller source and clipping a larger one.
// Padding bytes in dest are skipped, never written.
//
// Fixes relative to the earlier version:
//  - the V-plane clipping loop stepped one row at a time instead of two, so
//    it ran twice as many iterations as the plane has row pairs and
//    read/wrote past the end of both buffers;
//  - the top clip and the left clip are rounded down to even values so they
//    fall on whole chroma rows / samples of the 4:2:0 source.
//
// Returns false (after printing a message) when destWidth is not a multiple
// of 4, srcWidth is odd, a height is odd, or the right padding is not a
// multiple of 4; true otherwise.
bool planarYUYV420_to_planarYUYV411(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight) {
#ifdef VERIFY_MMX
	printf("non-MMX: planarYUYV420_to_planarYUYV411 (no MMX exists)\n");
#endif

	if( (destWidth&0x3) || (srcWidth&0x1) ) {
		printf("wrong width in planarYUYV420_to_planarYUYV411\n");
		return false;
	}
	if( (destHeight&0x1) || (srcHeight&0x1) ) {
		printf("wrong height in planarYUYV420_to_planarYUYV411\n");
		return false;
	}	

	int i;
	int destChromaRow=(destWidth>>2);	// bytes in one 4:1:1 chroma row

	if(destWidth==srcWidth && destHeight==srcHeight) {
		char *srcu=src+destHeight*destWidth;
		char *dstu=dest+destHeight*destWidth;

		// copy the y data
		memcpy(dest, src, (size_t)destHeight*destWidth);

		// The source holds destHeight/2 rows of u directly followed by
		// destHeight/2 rows of v, so destHeight source rows cover both
		// planes; each one becomes two destination rows.
		for(i=0; i<destHeight; i++) {
			for(int j=0; j<destChromaRow; ++j) {
				*(dstu) = *(srcu);
				*(dstu+destChromaRow) = *(srcu);
				srcu+=2;
				dstu++;
			}
			dstu+=destChromaRow; // did 2 rows above, so skip a row
		}
		return true;
	}

	// total columns/rows to add to (pad) or drop from (clip) the source
	int padWidth=(srcWidth<destWidth)?(destWidth-srcWidth):0;
	int clipWidth=(srcWidth>destWidth)?(srcWidth-destWidth):0;
	int padHeight=(srcHeight<destHeight)?(destHeight-srcHeight):0;
	int clipHeight=(srcHeight>destHeight)?(srcHeight-destHeight):0;

	int leftPad=(((padWidth>>1)>>2)<<2);	// whole 4:1:1 chroma samples
	int rightPad=padWidth-leftPad;
	int leftClip=((clipWidth>>1)&~0x1);		// whole 4:2:0 chroma samples
	int rightClip=clipWidth-leftClip;

	int upPad=(padHeight>>1);
	int downPad=padHeight-upPad;
	int upClip=((clipHeight>>1)&~0x1);		// whole 4:2:0 chroma rows
	int downClip=clipHeight-upClip;

	if( (leftPad&0x3) || (rightPad&0x3) ) {
		printf("horz padding must be multiple of 4 in planarYUYV420_to_planarYUYV411\n");
		return false;
	}

	// size of the region actually copied, in luma pixels
	int rows=(srcHeight>destHeight)?destHeight:srcHeight;
	int cols=(srcWidth>destWidth)?destWidth:srcWidth;

	// copy the y data: skip the top pad/clip, copy the rows, skip the bottom
	dest+=(destWidth*upPad);
	src+=(srcWidth*upClip);
	for(i=0; i<rows; ++i) {
		dest+=leftPad;
		src+=leftClip;
		memcpy(dest, src, cols);
		dest+=cols+rightPad;
		src+=cols+rightClip;
	}
	dest+=(destWidth*downPad);
	src+=(srcWidth*downClip);

	// copy the u plane and then the v plane
	for(int plane=0; plane<2; ++plane) {
		dest+=((destWidth*upPad)>>2);
		src+=((srcWidth*upClip)>>2);
		for(i=0; i<rows; i+=2) {
			dest+=(leftPad>>2);
			src+=(leftClip>>1);
			for(int j=0; j<(cols>>2); ++j) {
				*(dest) = *(src);
				*(dest+destChromaRow) = *(src);	// same column, next row
				src+=2;
				dest++;
			}
			dest+=(rightPad>>2);
			src+=(rightClip>>1);

			dest+=destChromaRow; // did 2 rows above, so skip a row
		}
		dest+=((destWidth*downPad)>>2);
		src+=((srcWidth*downClip)>>2);
	}
	return true;
}

//
//	planarYUYV420_to_planarYUYV420
//
// This function copies a planar-420 frame (a w*h Y plane followed by
// (w/2)*(h/2) U and V planes).  When both frames have the same size this is
// a single memcpy of 1.5*w*h bytes.  Otherwise the source is
// (approximately) centred in the destination, padding a smaller source and
// clipping a larger one.  Padding bytes in dest are skipped, never written.
//
// The top pad/clip is rounded down to an even number of rows and the left
// pad/clip to an even number of pixels, so that every offset falls on a
// whole chroma row / chroma sample.  Odd offsets used to shift the chroma
// planes by part of a row.
//
// Returns false (after printing a message) when a width or a height is odd,
// true otherwise.
bool planarYUYV420_to_planarYUYV420(char* dest, int destWidth, int destHeight,
									char* src, int srcWidth, int srcHeight)
{
#ifdef VERIFY_MMX
	printf("non-MMX: planarYUYV420_to_planarYUYV420 (no MMX exists)\n");
#endif
	
	if( (destWidth&0x1) || (srcWidth&0x1) ) {
		printf("wrong width in planarYUYV420_to_planarYUYV420\n");
		return false;
	}
	if( (destHeight&0x1) || (srcHeight&0x1) ) {
		printf("even height required in planarYUYV420_to_planarYUYV420\n");
		return false;
	}
	if(destWidth==srcWidth && destHeight==srcHeight) {
		// copy the y (w*h), u(w*h*0.25), and v (w*h*0.25), for a total of (w*h*1.5)
		memcpy (dest, src, (size_t)destHeight*destWidth+((destHeight*destWidth)>>1) );
		return true;
	}

	// total columns/rows to add to (pad) or drop from (clip) the source
	int padWidth=(srcWidth<destWidth)?(destWidth-srcWidth):0;
	int clipWidth=(srcWidth>destWidth)?(srcWidth-destWidth):0;
	int padHeight=(srcHeight<destHeight)?(destHeight-srcHeight):0;
	int clipHeight=(srcHeight>destHeight)?(srcHeight-destHeight):0;

	int leftPad=((padWidth>>1)&~0x1);
	int rightPad=padWidth-leftPad;
	int leftClip=((clipWidth>>1)&~0x1);
	int rightClip=clipWidth-leftClip;

	int upPad=((padHeight>>1)&~0x1);
	int downPad=padHeight-upPad;
	int upClip=((clipHeight>>1)&~0x1);
	int downClip=clipHeight-upClip;

	// size of the region actually copied, in luma pixels
	int rows=(srcHeight>destHeight)?destHeight:srcHeight;
	int cols=(srcWidth>destWidth)?destWidth:srcWidth;
	int halfCols=(cols>>1);
	int i;

	// copy the y data: skip the top pad/clip, copy the rows, skip the bottom
	dest+=(destWidth*upPad);
	src+=(srcWidth*upClip);
	for(i=0; i<rows; ++i) {
		dest+=leftPad;
		src+=leftClip;
		memcpy(dest, src, cols);
		dest+=cols+rightPad;
		src+=cols+rightClip;
	}
	dest+=(destWidth*downPad);
	src+=(srcWidth*downClip);

	// copy the u plane and then the v plane; each has half the luma width
	// and half the luma height
	for(int plane=0; plane<2; ++plane) {
		dest+=((destWidth*upPad)>>2);
		src+=((srcWidth*upClip)>>2);
		for(i=0; i<(rows>>1); i++) {
			dest+=(leftPad>>1);
			src+=(leftClip>>1);
			memcpy(dest, src, halfCols);
			dest+=halfCols+(rightPad>>1);
			src+=halfCols+(rightClip>>1);
		}
		dest+=((destWidth*downPad)>>2);
		src+=((srcWidth*downClip)>>2);
	}
	return true;
}
/////////////////////////////////////////////////////////////////

//
//	packedYUYV422_to_planarYUYV422
//
// This function unpacks every frame into a planar form, i.e., does
//	YUYV YUYV ... YUYV -> YY ... Y UU ... U VV ... V
//
// src holds 2*srcWidth bytes per row; dest receives a w*h Y plane followed
// by (w/2)*h U and V planes.  When the sizes differ the source is
// (approximately) centred in the destination, padding a smaller source and
// clipping a larger one.  Padding bytes in dest are skipped, never written.
//
// The left pad/clip is rounded down to an even number of pixels.  An odd
// left clip used to start reading in the middle of a YUYV group, which
// swapped the U and V samples, and an odd left pad skewed the chroma planes.
//
// Returns false (after printing a message) when either width is odd,
// true otherwise.  When X86_ASSEMBLER is defined this plain-C version is
// compiled under the name packedYUYV422_to_planarYUYV422_nonMMX instead.

#ifndef X86_ASSEMBLER// if no MMX
bool packedYUYV422_to_planarYUYV422(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#else
bool packedYUYV422_to_planarYUYV422_nonMMX(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#endif
#ifdef VERIFY_MMX
	printf("non-MMX: packedYUYV422_to_planarYUYV422\n");
#endif
	if( (srcWidth&0x1) || (destWidth&0x1) ) {
		printf("width not 2 in packedYUYV422_to_planarYUYV422_nonMMX\n");
		return false;
	}

	// total columns/rows to add to (pad) or drop from (clip) the source;
	// all of these are zero when the two frames have the same size
	int padWidth=(srcWidth<destWidth)?(destWidth-srcWidth):0;
	int clipWidth=(srcWidth>destWidth)?(srcWidth-destWidth):0;
	int padHeight=(srcHeight<destHeight)?(destHeight-srcHeight):0;
	int clipHeight=(srcHeight>destHeight)?(srcHeight-destHeight):0;

	// the left part is kept even so that it covers whole YUYV groups
	int leftPad=((padWidth>>1)&~0x1);
	int rightPad=padWidth-leftPad;
	int leftClip=((clipWidth>>1)&~0x1);
	int rightClip=clipWidth-leftClip;

	int upPad=(padHeight>>1);
	int upClip=(clipHeight>>1);

	// size of the region actually converted, in luma pixels
	int rows=(srcHeight>destHeight)?destHeight:srcHeight;
	int cols=(srcWidth>destWidth)?destWidth:srcWidth;

	// start of each destination plane, moved past the top padding, and
	// start of the source, moved past the top clipping (2 bytes per pixel)
	char *y=dest+(destWidth*upPad);
	char *u=dest+destWidth*destHeight+((destWidth*upPad)>>1);
	char *v=u+((destWidth*destHeight)>>1);
	char *s=src+((srcWidth*upClip)<<1);

	for(int row=0; row<rows; ++row) {
		y+=leftPad;
		u+=(leftPad>>1);
		v+=(leftPad>>1);
		s+=(leftClip<<1);

		// packed representation is YUYV YUYV YUYV
		for(int pair=(cols>>1); pair>0; --pair) {
			*(y++) = *(s++);
			*(u++) = *(s++);
			*(y++) = *(s++);
			*(v++) = *(s++);
		}

		y+=rightPad;
		u+=(rightPad>>1);
		v+=(rightPad>>1);
		s+=(rightClip<<1);
	}
	// don't perform any action for the padding/clipping on bottom of image
	return true;
}
#ifdef X86_ASSEMBLER

#ifndef WIN32

// emms operations are relatively slow, so try to minimize them when possible when using
// a series of planar32bytes... function calls.  This is accomplished by passing
// false as the issueEmms argument of the planar32bytes... functions, and then
// calling issueEmms() once afterwards.
//
// Executes a single "emms" instruction (GCC inline-assembly version).
inline void issueEmms()
{
	// the "memory" clobber tells the compiler not to move memory accesses across the instruction
	__asm__ __volatile__(
	"emms\n\t"
	:
	:
	:"memory");
}

// Unpacks packed YUYV data into separate Y, U and V buffers with MMX
// (GCC inline-assembly version).
//
//   src    packed input, 32 bytes are read per loop iteration
//   yy     Y output, 16 bytes are written per iteration
//   uu     U output, 8 bytes are written per iteration
//   vv     V output, 8 bytes are written per iteration
//   loops  iteration count; the loop condition is tested at the bottom, so
//          the body runs once even when loops is 0 or negative
//   issueEmms  when true an "emms" instruction is executed before returning
//          (this parameter shadows the issueEmms() function inside the body)
//
// NOTE(review): the pointers are stored in 32-bit ints and the code uses
// 32-bit registers, so this looks usable only in 32-bit x86 builds -- confirm.
// NOTE(review): ebx/ecx/edx/esi are saved with push/pop instead of being
// listed as clobbers, and the label "top_of_loop1" is a fixed name, which may
// clash if the compiler emits this inline function more than once in one
// translation unit -- confirm with the toolchain in use.
inline void planar32bytesYUYV422(char * src, char* yy, char* uu, char * vv, int loops, bool issueEmms=true)
{
	int i = loops;
	char * s = src;
	char * y = yy;
	char * u = uu;
	char * v = vv;
	// mask that keeps the low byte of every 16-bit word
	i64 __volatile__ andArray=0x00ff00ff00ff00ffLL;
	// argument block read by the assembly: [0]=src [1]=y [2]=u [3]=v [4]=loops
	unsigned int __volatile__ array[5]={(int)s, int(y), int(u), int(v), i};
	unsigned int var=(unsigned int)(array);
	int __volatile__ dummy1,dummy2;

	// on entry: eax = address of array[], edi = address of andArray
	__asm__ __volatile__(
				"push %%ebx\n\t"
				"push %%ecx\n\t"
				"push %%edx\n\t"
				"push %%esi\n\t"

				"mov  4(%%eax), %%ebx\n\t" // ebx = y destination
				"mov  8(%%eax), %%ecx\n\t" // ecx = u destination
				"mov 12(%%eax), %%edx\n\t" // edx = v destination
				"mov 16(%%eax), %%esi\n\t" // esi = loop counter
				"mov  0(%%eax), %%eax\n\t" // eax = source (loaded last, it overwrites the array pointer)

				"movq   (%%edi), %%mm6\n\t" // mm6 = low-byte mask
				"top_of_loop1:\n\t"
				// load 32 bytes of packed data
				"movq   (%%eax), %%mm0\n\t"
				"movq  8(%%eax), %%mm1\n\t"
				"movq 16(%%eax), %%mm2\n\t"
				"movq 24(%%eax), %%mm3\n\t"

				// first 8 y's: low byte of each word of mm0/mm1
				"movq %%mm0, %%mm4\n\t"
				"movq %%mm1, %%mm5\n\t"

				"pand %%mm6, %%mm4\n\t"
				"pand %%mm6, %%mm5\n\t"

				"psrlw $8, %%mm0\n\t" // mm0 now holds only the high (u/v) bytes
				"packuswb %%mm5, %%mm4\n\t"
				"psrlw $8, %%mm1\n\t"
				"movq %%mm4, (%%ebx)\n\t"

				// second 8 y's: low byte of each word of mm2/mm3
				"movq %%mm3, %%mm5\n\t"
				"movq %%mm2, %%mm4\n\t"

				"pand %%mm6, %%mm5\n\t"
				"pand %%mm6, %%mm4\n\t"

				"psrlw $8, %%mm2\n\t"
				"packuswb %%mm5, %%mm4\n\t"
				"psrlw $8, %%mm3\n\t"
				"movq %%mm4, 8(%%ebx)\n\t"

				// mm0 and mm2 become the interleaved u/v bytes
				"packuswb %%mm1, %%mm0\n\t"
				"packuswb %%mm3, %%mm2\n\t"

				"movq %%mm0, %%mm4\n\t"
				"movq %%mm2, %%mm5\n\t"

				// high byte of each u/v pair -> v
				"psrlw $8, %%mm4\n\t"
				"psrlw $8, %%mm5\n\t"

				// low byte of each u/v pair -> u
				"pand %%mm6, %%mm0\n\t"

				"packuswb %%mm5, %%mm4\n\t"

				"pand %%mm6, %%mm2\n\t"

				"movq %%mm4, (%%edx)\n\t" // store 8 v's

				"packuswb %%mm2, %%mm0\n\t"

				"dec %%esi\n\t"

				"movq %%mm0, (%%ecx)\n\t" // store 8 u's

				// advance all four pointers and loop while the counter is > 0
				"add $32, %%eax\n\t"
				"add $16, %%ebx\n\t"
				"add $8, %%ecx\n\t"
				"add $8, %%edx\n\t"
				"cmp $0,%%esi\n\t"
				"jg top_of_loop1\n\t"
				//"emms\n\t"
				"pop %%esi\n\t"
				"pop %%edx\n\t"
				"pop %%ecx\n\t"
				"pop %%ebx\n\t"
				: "=a" (dummy1), "=D" (dummy2)
				: "a" (var) /*(array)*/ , "D" (&andArray)
				: "memory");
	// leave the MMX state only when the caller asked for it
	if(issueEmms) {
			__asm__ __volatile__(
				"emms\n\t"
				: 
				: 
				: "memory");
	}
}

#else // windows and mmx case

// emms operations are relatively slow, so try to minimize them when possible when using
// a series of planar32bytes... function calls.  This is accomplished by passing false
// as the issueEmms argument of the planar32bytes... functions, and then calling
// issueEmms() once after the last of those calls.
//
// Executes a single emms instruction (empties the MMX state).
inline void issueEmms()
{
	_asm
	{	
		emms            // empty MMX state
	}
}

// MSVC inline-assembly MMX kernel that splits packed 4:2:2 data (byte order
// Y U Y V ...) into separate Y, U and V planes.
//
// Each loop iteration reads 32 bytes from data and stores 16 bytes at y,
// 8 bytes at u and 8 bytes at v, then advances all four pointers.
//
//   data      - packed source
//   y, u, v   - destination planes
//   loops     - number of 32-byte groups to convert; the counter is only
//               tested after the loop body, so one group is processed even
//               when loops <= 0
//   issueEmms - when true, an emms instruction is executed before returning.
//               Inside this body the parameter hides the issueEmms() function.
inline void planar32bytesYUYV422(long * data, long* y, long* u, long* v, int loops, bool issueEmms=true)
{
	// mask that keeps the low byte of every 16-bit word
	__int64 andArray=0x00ff00ff00ff00ff;
	_asm
	{	
		push edi	// edi is used as the loop counter below; save it first
		// Load pointers
		mov eax, [data]		// load ptr to source
		mov ebx, [y]		// location of Y dest
		mov ecx, [u]		// location of U dest
		mov edx, [v]		// location of V dest
		mov edi, [loops]	// load loop iterations
		movq mm6, [andArray]	// Load AND array

top_of_loop:		

		// Dereference pointers to load data into registers
		// (32 source bytes; every 16-bit word holds Y in its low byte and
		// a U or V sample in its high byte)
		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]

		// Extract the first set of y's
		movq mm4, mm0		// Copy data to temp location
		movq mm5, mm1		// Copy data to temp location

		pand mm4, mm6		// strip off U/V's	<SA>
		pand mm5, mm6		// strip off U/V's	<SA>

		psrlw mm0, 8		// strip off Y's (U/V move to the low byte)
		packuswb mm4, mm5	// Pack first half of Y's together
		psrlw mm1, 8		// strip off Y's (U/V move to the low byte)

		movq [ebx], mm4		// Save Y's

		// Extract the second set of y's
		movq mm5, mm3		// Copy data to temp location
		movq mm4, mm2		// Copy data to temp location

		pand mm5, mm6		// strip off U/V's
		pand mm4, mm6		// strip off U/V's

		psrlw mm2, 8		// shift Y's away
		packuswb mm4, mm5	// Pack second half of Y's together
		psrlw mm3, 8		// shift Y's away
		movq [ebx+8], mm4	// Save Y's

		// Combine U's and V's together
		// (each result holds 8 interleaved chroma bytes: U V U V ...)
		packuswb mm0,mm1	// Pack first half together
		packuswb mm2,mm3	// Pack second half together

		// Combine and save V's
		movq mm4, mm0		// Make a copy
		movq mm5, mm2		// Make a copy
		psrlw mm4, 8		// Shift down and strip off U's
		psrlw mm5, 8		// Shift down and strip off U's
		pand mm0, mm6		// strip off V's	<SA>
		packuswb mm4,mm5	// Pack Together
		pand mm2, mm6		// strip off V's <SA>
		movq [edx], mm4		// Save

		// Combine and save U's
		packuswb mm0,mm2	// Pack together
		dec edi				// Decrement loop count <SA>
		movq [ecx], mm0		// Save

		add eax, 32		// Advance pointer
		add ebx, 16		// Advance pointer

		add ecx, 8		// Advance pointer <SA>
		add edx, 8		// Advance pointer <SA>
		cmp edi,0		// Perform test
		jg top_of_loop	// jump back to top for another loop
		// emms is not issued here; it is done conditionally below
		pop edi			// restore the register
	}
	// When the caller asked for no emms, return now; otherwise run the plain
	// block that follows (the braces are not attached to the if).
	if(!issueEmms) return; {
		_asm
		{	
			emms            // empty MMX state
		}
	}
}

#endif // end of windows case

// packedYUYV422_to_planarYUYV422 (MMX build)
//
// Converts a packed YUYV 4:2:2 image in src into planar form in dest: a
// destWidth*destHeight Y plane followed by a U plane and a V plane of half
// that size each.  When the sizes differ the source is centred: a smaller
// source leaves the border bytes of dest untouched (they are skipped, not
// cleared) and a larger source is cropped.
//
// The work is handed to packedYUYV422_to_planarYUYV422_nonMMX() when MMX is
// not supported, when either width is odd, when destWidth*rows is not a
// multiple of 16, or when the widths differ and either one is not a multiple
// of 16.
//
// Returns true on the MMX paths, otherwise the result of the fallback.
bool packedYUYV422_to_planarYUYV422(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
	if( !supportsMMX() ) {
		return packedYUYV422_to_planarYUYV422_nonMMX(dest,destWidth,destHeight,
					src, srcWidth, srcHeight);
	}

#ifdef VERIFY_MMX
	printf("MMX: packedYUYV422_to_planarYUYV422\n");
#endif
	// only the rows present in both images are converted
	int rows=(destHeight<srcHeight)?destHeight:srcHeight;
	if( (srcWidth&0x1) || (destWidth&0x1) || ((destWidth*rows)&0xf) ) {
#ifdef VERIFY_MMX
		printf("size not 2/16 in video-v4l.cc packedYUYV422_to_planar422 MMX\n");
#endif
		return packedYUYV422_to_planarYUYV422_nonMMX(dest,destWidth,destHeight,
									  src, srcWidth, srcHeight);
	}

	char * aa=src;
	char * yy=dest;
	char * uu=dest+destWidth*destHeight;
	char * vv=uu+((destWidth*destHeight)>>1);
	if(destHeight>srcHeight) {
		// shorter source: skip the top border rows in all three planes
		int difference=((destHeight-srcHeight)>>1);
		yy+=difference*destWidth;
		uu+=((difference*destWidth)>>1);
		vv+=((difference*destWidth)>>1);
	}
	else if(destHeight<srcHeight) {
		// taller source: skip the rows cropped off the top (2 bytes per pixel)
		int difference=((srcHeight-destHeight)>>1);
		aa+=((difference*srcWidth)<<1);
	}

	if(destWidth==srcWidth) {
		// Rows are contiguous in both buffers, so one kernel call converts
		// them all; with the default issueEmms argument the kernel executes
		// emms itself.
		int loops=((destWidth * rows)>>4);
		// The kernel tests its counter only after the loop body, so it would
		// still convert 16 pixels for an empty image; skip it in that case.
		if(loops>0) {
#ifndef WIN32
			planar32bytesYUYV422( aa, yy, uu, vv, loops);
#else
			planar32bytesYUYV422( (long*)aa, (long*)yy, (long*)uu, (long*)vv, loops);
#endif
		}
		return true;
	}

	// Differing widths are converted row by row, 16 pixels per kernel
	// iteration, which needs both widths to be multiples of 16.
	if( (destWidth&0xf) || (srcWidth&0xf) ) {
#ifdef VERIFY_MMX
		printf("width mismatch not 16 in video-v4l.cc packedYUYV422_to_planar422 MMX\n");
#endif
		return packedYUYV422_to_planarYUYV422_nonMMX(dest,destWidth,destHeight,
				  src,srcWidth,srcHeight);
	}

	int leftPad=(srcWidth<destWidth)?((destWidth-srcWidth)>>1):0;
	int leftClip=(srcWidth>destWidth)?((srcWidth-destWidth)>>1):0;
	// round down to a multiple of 4 pixels (with both widths multiples of 16
	// the values already are)
	leftPad&=~(0x3);
	leftClip&=~(0x3);
	int rightPad=(srcWidth<destWidth)?(destWidth-srcWidth-leftPad):0;
	int rightClip=(srcWidth>destWidth)?(srcWidth-destWidth-leftClip):0;

	if(leftPad!=0 || rightPad!=0) {
		int wideLoops=srcWidth>>4; // divide by 16 pixels
		if(wideLoops<=0) {
			// empty source rows; the kernel must not run (see above)
			return true;
		}
		for(int i=rows; i>0; --i) {
			yy+=leftPad; // pad the left side
			uu+=(leftPad>>1);
			vv+=(leftPad>>1);
#ifndef WIN32
			planar32bytesYUYV422( aa, yy, uu, vv, wideLoops, EMMS_FLAG);
#else
			planar32bytesYUYV422( (long*)aa, (long*)yy, (long*)uu, (long*)vv, wideLoops, EMMS_FLAG);
#endif
			aa+=(srcWidth<<1); // advance the pointers
			yy+=srcWidth;
			uu+=(srcWidth>>1);
			vv+=(srcWidth>>1);

			yy+=rightPad; // pad the right side
			uu+=(rightPad>>1);
			vv+=(rightPad>>1);
		}
		issueEmms(); // send an Emms instruction to reset the MMX state
	}
	else { // clipping !=0;
		int wideLoops=destWidth>>4; // divide by 16 pixels
		if(wideLoops<=0) {
			// empty destination rows; the kernel must not run (see above)
			return true;
		}
		for(int i=rows; i>0; --i) {
			aa+=(leftClip<<1); // clip the left side
#ifndef WIN32
			planar32bytesYUYV422( aa, yy, uu, vv, wideLoops, EMMS_FLAG);
#else
			planar32bytesYUYV422( (long*)aa, (long*)yy, (long*)uu, (long*)vv, wideLoops, EMMS_FLAG);
#endif
			aa+=(destWidth<<1); // advance the pointers
			yy+=destWidth;
			uu+=(destWidth>>1);
			vv+=(destWidth>>1);

			aa+=(rightClip<<1); // clip the right side
		}
		issueEmms(); // send an Emms instruction to reset the MMX state
	} // end clipping case
	return true;
}

#endif // end of mmx case for packedYUYV422_to_planarYUYV422

//
//	packedUYVY422_to_planarYUYV422
//
// This function unpacks every frame into a planar form, i.e., does
//	UYVY UYVY ... UYVY -> YY ... Y UU ... U VV ... V
// Scalar conversion of a packed UYVY 4:2:2 image (src) into planar form
// (dest): a destWidth*destHeight Y plane followed by U and V planes of half
// that size each.  Without X86_ASSEMBLER this is the public entry point; with
// it, the same body is compiled under the _nonMMX name.
//
// When the sizes differ the source is centred in (or cropped to) the
// destination.  Border bytes of dest are skipped, not written.  Horizontal
// offsets are rounded down to an even pixel count so that every copied row
// starts on a complete U Y V Y group and the chroma rows stay destWidth/2
// bytes apart.
//
// Returns false (after printing a message) when either width is odd,
// true otherwise.
#ifndef X86_ASSEMBLER

bool packedUYVY422_to_planarYUYV422(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#else
bool packedUYVY422_to_planarYUYV422_nonMMX(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#endif
#ifdef VERIFY_MMX
	printf("non-MMX: packedUYVY422_to_planarYUYV422\n");
#endif
	if( (destWidth&0x1) || (srcWidth&0x1) ) {
		printf("wrong width packedUYVY422_to_planarYUYV422_nonMMX\n");
		return false;
	}

	char *s = src;
	char *y = dest;
	char *u = y + destWidth * destHeight;
	char *v = u + ((destWidth * destHeight)>>1);

	if(destWidth==srcWidth && destHeight==srcHeight) {
		// identical geometry: one pass over all pixel pairs
		int i = ((destWidth * destHeight)>>1);

		// packed representation is UYVY UYVY UYVY
		while (i--) {
			*(u++) = *(s++);
			*(y++) = *(s++);
			*(v++) = *(s++);
			*(y++) = *(s++);
		}
		return true;
	}

	// non-matching size case
	int leftPad=(srcWidth<destWidth)?((destWidth-srcWidth)>>1):0;
	int leftClip=(srcWidth>destWidth)?((srcWidth-destWidth)>>1):0;
	// An odd leftPad would advance each chroma row by less than destWidth/2
	// (the halved pads are truncated), and an odd leftClip would start
	// reading in the middle of a U Y V Y group, swapping U and V.  Both
	// widths are even, so rounding the left offset down to an even value
	// keeps the right offset even as well.
	leftPad&=~0x1;
	leftClip&=~0x1;
	int rightPad=(srcWidth< destWidth)?(destWidth-srcWidth-leftPad):0;
	int rightClip=(srcWidth>destWidth)?(srcWidth-destWidth-leftClip):0;

	int upPad=(srcHeight<destHeight)?((destHeight-srcHeight)>>1):0;
	int upClip=(srcHeight>destHeight)?((srcHeight-destHeight)>>1):0;

	int rows=(srcHeight>destHeight)?destHeight:srcHeight;

	// handle the y up padding
	y+=(destWidth*upPad);
	u+=((destWidth*upPad)>>1);
	v+=((destWidth*upPad)>>1);

	// handle the y up clipping
	s+=((srcWidth*upClip)<<1);

	// copy the video
	int i;
	if(leftPad!=0 || rightPad!=0) { // if padding necessary
		for(i=0; i<rows; ++i) {
			y+=leftPad;
			u+=(leftPad>>1);
			v+=(leftPad>>1);

			int j=(srcWidth>>1);
			// packed representation is UYVY UYVY UYVY
			while (j--) {
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
			}
			y+=rightPad;
			u+=(rightPad>>1);
			v+=(rightPad>>1);
		}
	}
	else { // if clipping necessary on source
		for(i=0; i<rows; ++i) {
			s+=(leftClip<<1);

			int j=(destWidth>>1);
			// packed representation is UYVY UYVY UYVY
			while (j--) {
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
			}
			s+=(rightClip<<1);
		}
	}
	// don't perform any action for the padding/clipping on bottom of image
	return true;
}
#ifdef X86_ASSEMBLER

#ifndef WIN32 // non-windows case

// GCC inline-assembly MMX kernel that splits packed 4:2:2 data in byte order
// U Y V Y ... into separate Y, U and V planes.
//
// Each loop iteration reads 32 bytes from src and stores 16 Y bytes at yy,
// 8 U bytes at uu and 8 V bytes at vv, then advances all four pointers.
//
//   loops     - iteration count; it is decremented and tested only after the
//               loop body, so the body runs once even when loops <= 0
//   issueEmms - when true an emms instruction is executed before returning
//
// NOTE(review): the pointers are stored in an array of unsigned int and
// loaded into 32-bit registers, so this presumably targets 32-bit builds only.
// NOTE(review): top_of_loop2 is an ordinary assembler label; if the compiler
// emits this inline body more than once in one translation unit the label
// would be defined twice -- confirm against the GCC extended-asm documentation.
inline void planar32bytesUYVY422(char * src, char* yy, char* uu, char * vv, int loops, bool issueEmms=true)
{
	// mask that keeps the low byte of every 16-bit word
	i64 __volatile__ andArray=0x00ff00ff00ff00ffLL;
	// all arguments are passed to the asm through one array addressed by eax
	unsigned int __volatile__ array[5]={(int)src, int(yy), int(uu), int(vv), loops};
	unsigned int var=(unsigned int)(array);
	// outputs bound to eax/edi; their values are never read
	int dummy1,dummy2;

	__asm__ __volatile__(
		       // save the registers used for the pointers and the counter
		       "push %%ebx\n\t"
		       "push %%ecx\n\t"
		       "push %%edx\n\t"
		       "push %%esi\n\t"

		       // unpack array[]: ebx=yy, ecx=uu, edx=vv, esi=loops, eax=src
		       // (eax is loaded last because it holds the array address)
		       "mov  4(%%eax), %%ebx\n\t"
		       "mov  8(%%eax), %%ecx\n\t"
		       "mov 12(%%eax), %%edx\n\t"
		       "mov 16(%%eax), %%esi\n\t"
		       "mov  0(%%eax), %%eax\n\t"

		       // mm6 = andArray
		       "movq   (%%edi), %%mm6\n\t"
		       "top_of_loop2:\n\t"
		       // load 32 source bytes; each 16-bit word has U or V in its
		       // low byte and Y in its high byte
		       "movq   (%%eax), %%mm0\n\t"
		       "movq  8(%%eax), %%mm1\n\t"
		       "movq 16(%%eax), %%mm2\n\t"
		       "movq 24(%%eax), %%mm3\n\t"

		       // first 8 Y's: shift Y down in the copies, mask the
		       // originals down to chroma, pack and store
		       "movq %%mm0, %%mm4\n\t"
		       "movq %%mm1, %%mm5\n\t"

		       "psrlw $8, %%mm4\n\t"
		       "psrlw $8, %%mm5\n\t"

		       "pand %%mm6, %%mm0\n\t"
		       "packuswb %%mm5, %%mm4\n\t"
		       "pand %%mm6, %%mm1\n\t"
		       "movq %%mm4, (%%ebx)\n\t"

		       // second 8 Y's, same steps
		       "movq %%mm3, %%mm5\n\t"
		       "movq %%mm2, %%mm4\n\t"

		       "psrlw $8, %%mm5\n\t"
		       "psrlw $8, %%mm4\n\t"

		       "pand %%mm6, %%mm2\n\t"
		       "packuswb %%mm5, %%mm4\n\t"
		       "pand %%mm6, %%mm3\n\t"
		       "movq %%mm4, 8(%%ebx)\n\t"

		       // pack the chroma words: mm0 and mm2 each hold U V U V ...
		       "packuswb %%mm1, %%mm0\n\t"
		       "packuswb %%mm3, %%mm2\n\t"

		       // V's: high byte of each chroma pair, shifted down and packed
		       "movq %%mm0, %%mm4\n\t"
		       "movq %%mm2, %%mm5\n\t"

		       "psrlw $8, %%mm4\n\t"
		       "psrlw $8, %%mm5\n\t"

		       // U's: low byte of each chroma pair
		       "pand %%mm6, %%mm0\n\t"

		       "packuswb %%mm5, %%mm4\n\t"

		       "pand %%mm6, %%mm2\n\t"

		       // store 8 V bytes
		       "movq %%mm4, (%%edx)\n\t"

		       "packuswb %%mm2, %%mm0\n\t"

		       "dec %%esi\n\t"

		       // store 8 U bytes
		       "movq %%mm0, (%%ecx)\n\t"

		       // advance the pointers and loop while the counter is > 0
		       "add $32, %%eax\n\t"
		       "add $16, %%ebx\n\t"
		       "add $8, %%ecx\n\t"
		       "add $8, %%edx\n\t"
		       "cmp $0,%%esi\n\t"
		       "jg top_of_loop2\n\t"
		       // emms is not issued here; it is done conditionally below
		       "pop %%esi\n\t"
		       "pop %%edx\n\t"
		       "pop %%ecx\n\t"
		       "pop %%ebx\n\t"
		       : "=a" (dummy1) , "=D" (dummy2)
		       : "a" (var) , "D" (&andArray) 
		       // eax = address of array[], edi = address of the mask
		       : "memory");
	if(issueEmms) {
			// empty the MMX state
			__asm__ __volatile__(
				"emms\n\t"
				: 
				: 
				:"memory");
	}
}

#else // mmx and windows case

// MSVC inline-assembly MMX kernel that splits packed 4:2:2 data in byte order
// U Y V Y ... into separate Y, U and V planes.
//
// Each loop iteration reads 32 bytes from data and stores 16 bytes at y,
// 8 bytes at u and 8 bytes at v, then advances all four pointers.
//
//   loops     - number of 32-byte groups to convert; the counter is only
//               tested after the loop body, so one group is processed even
//               when loops <= 0
//   issueEmms - when true, an emms instruction is executed before returning
inline void planar32bytesUYVY422(long * data, long* y, long* u, long* v, int loops, bool issueEmms=true)
{
	// mask that keeps the low byte of every 16-bit word
	__int64 andArray=0x00ff00ff00ff00ff;
	_asm
	{	
		push edi	// edi is used as the loop counter below; save it first
		// Load pointers
		mov eax, [data]		// load ptr to source
		mov ebx, [y]		// location of Y dest
		mov ecx, [u]		// location of U dest
		mov edx, [v]		// location of V dest
		mov edi, [loops]	// load loop iterations
		movq mm6, [andArray]	// Load AND array

top_of_loop:		

		// Dereference pointers to load data into registers
		// (32 source bytes; every 16-bit word holds U or V in its low byte
		// and Y in its high byte)
		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]

		// Extract the first set of y's
		movq mm4, mm0		// Copy data to temp location
		movq mm5, mm1		// Copy data to temp location
		psrlw mm4, 8		// Get high bits (Y) & shift U/V away
		psrlw mm5, 8		// Get high bits (Y) & shift U/V away
		pand mm0, mm6		// strip off Y's	<SA>
		packuswb mm4, mm5	// Pack first half of Y's together
		pand mm1, mm6		// strip off Y's	<SA>
		movq [ebx], mm4		// Save Y's

		// Extract the second set of y's
		movq mm5, mm3		// Copy data to temp location
		movq mm4, mm2		// Copy data to temp location
		psrlw mm5, 8		// Get high bits (Y) & shift U/V away
		psrlw mm4, 8		// Get high bits (Y) & shift U/V away
		pand mm2, mm6		// strip off Y's	<SA>
		packuswb mm4, mm5	// Pack second half of Y's together
		pand mm3, mm6		// strip off Y's	<SA>
		movq [ebx+8], mm4	// Save Y's

		// Combine U's and V's together
		// (each result holds 8 interleaved chroma bytes: U V U V ...)
		packuswb mm0,mm1	// Pack first half together
		packuswb mm2,mm3	// Pack second half together

		// Combine and save V's
		movq mm4, mm0		// Make a copy
		movq mm5, mm2		// Make a copy
		psrlw mm4, 8		// Shift down and strip off U's
		psrlw mm5, 8		// Shift down and strip off U's
		pand mm0, mm6		// strip off V's	<SA>
		packuswb mm4,mm5	// Pack Together
		pand mm2, mm6		// strip off V's <SA>
		movq [edx], mm4		// Save

		// Combine and save U's
		packuswb mm0,mm2	// Pack together
		dec edi				// Decrement loop count <SA>
		movq [ecx], mm0		// Save

		add eax, 32		// Advance pointer
		add ebx, 16		// Advance pointer

		add ecx, 8		// Advance pointer <SA>
		add edx, 8		// Advance pointer <SA>
		cmp edi,0		// Perform test
		jg top_of_loop	// jump back to top for another loop
		// emms is not issued here; it is done conditionally below
		pop edi			// restore the register
	}
	// When the caller asked for no emms, return now; otherwise run the plain
	// block that follows (the braces are not attached to the if).
	if(!issueEmms) return;{
		_asm
		{	
			emms            // empty MMX state
		}
	}
}
#endif // end of windows case

// packedUYVY422_to_planarYUYV422 (MMX build)
//
// Converts a packed UYVY 4:2:2 image in src into planar form in dest: a
// destWidth*destHeight Y plane followed by a U plane and a V plane of half
// that size each.  When the sizes differ the source is centred: a smaller
// source leaves the border bytes of dest untouched (they are skipped, not
// cleared) and a larger source is cropped.
//
// The work is handed to packedUYVY422_to_planarYUYV422_nonMMX() when MMX is
// not supported, when either width is odd, when destWidth*rows is not a
// multiple of 16, or when the widths differ and either one is not a multiple
// of 16.
//
// Returns true on the MMX paths, otherwise the result of the fallback.
bool packedUYVY422_to_planarYUYV422(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#ifdef VERIFY_MMX
	printf("MMX: packedUYVY422_to_planarYUYV422\n");
#endif
	if(!supportsMMX()) {
		return packedUYVY422_to_planarYUYV422_nonMMX(dest,destWidth,destHeight,
						  src,srcWidth,srcHeight);
	}

	// only the rows present in both images are converted
	int rows=(destHeight<srcHeight)?destHeight:srcHeight;
	if( (srcWidth&0x1) || (destWidth&0x1) || ((destWidth*rows)&0xf) ) {
#ifdef VERIFY_MMX
		printf("size not 2/16 in video-v4l.cc packedUYVY422_to_planar422 MMX\n");
#endif
		return packedUYVY422_to_planarYUYV422_nonMMX(dest,destWidth,destHeight,
									  src, srcWidth, srcHeight);
	}

	char * aa=src;
	char * yy=dest;
	char * uu=dest+destWidth*destHeight;
	char * vv=uu+((destWidth*destHeight)>>1);
	if(destHeight>srcHeight) {
		// shorter source: skip the top border rows in all three planes
		int difference=((destHeight-srcHeight)>>1);
		yy+=difference*destWidth;
		uu+=((difference*destWidth)>>1);
		vv+=((difference*destWidth)>>1);
	}
	else if(destHeight<srcHeight) {
		// taller source: skip the rows cropped off the top (2 bytes per pixel)
		int difference=((srcHeight-destHeight)>>1);
		aa+=((difference*srcWidth)<<1);
	}

	if(destWidth==srcWidth) {
		// Rows are contiguous in both buffers, so one kernel call converts
		// them all; with the default issueEmms argument the kernel executes
		// emms itself.
		int loops=((destWidth * rows)>>4);
		// The kernel tests its counter only after the loop body, so it would
		// still convert 16 pixels for an empty image; skip it in that case.
		if(loops>0) {
#ifndef WIN32
			planar32bytesUYVY422( aa, yy, uu, vv, loops);
#else
			planar32bytesUYVY422( (long*)aa, (long*)yy, (long*)uu, (long*)vv, loops);
#endif
		}
		return true;
	}

	// Differing widths are converted row by row, 16 pixels per kernel
	// iteration, which needs both widths to be multiples of 16.
	if( (destWidth&0xf) || (srcWidth&0xf) ) {
#ifdef VERIFY_MMX
		printf("width mismatch not 16 in video-v4l.cc packedUYVY422_to_planar422 MMX\n");
#endif
		return packedUYVY422_to_planarYUYV422_nonMMX(dest,destWidth,destHeight,
				  src,srcWidth,srcHeight);
	}

	int leftPad=(srcWidth<destWidth)?((destWidth-srcWidth)>>1):0;
	int leftClip=(srcWidth>destWidth)?((srcWidth-destWidth)>>1):0;
	// round down to a multiple of 4 pixels (with both widths multiples of 16
	// the values already are)
	leftPad&=~(0x3);
	leftClip&=~(0x3);
	int rightPad=(srcWidth<destWidth)?(destWidth-srcWidth-leftPad):0;
	int rightClip=(srcWidth>destWidth)?(srcWidth-destWidth-leftClip):0;

	if(leftPad!=0 || rightPad!=0) {
		int wideLoops=srcWidth>>4; // divide by 16 pixels
		if(wideLoops<=0) {
			// empty source rows; the kernel must not run (see above)
			return true;
		}
		for(int i=rows; i>0; --i) {
			yy+=leftPad; // pad the left side
			uu+=(leftPad>>1);
			vv+=(leftPad>>1);
#ifndef WIN32
			planar32bytesUYVY422( aa, yy, uu, vv, wideLoops, EMMS_FLAG);
#else
			planar32bytesUYVY422( (long*)aa, (long*)yy, (long*)uu, (long*)vv, wideLoops, EMMS_FLAG);
#endif
			aa+=(srcWidth<<1); // advance the pointers
			yy+=srcWidth;
			uu+=(srcWidth>>1);
			vv+=(srcWidth>>1);

			yy+=rightPad; // pad the right side
			uu+=(rightPad>>1);
			vv+=(rightPad>>1);
		}
		issueEmms(); // send an Emms instruction to reset the MMX state
	}
	else { // clipping !=0;
		int wideLoops=destWidth>>4; // divide by 16 pixels
		if(wideLoops<=0) {
			// empty destination rows; the kernel must not run (see above)
			return true;
		}
		for(int i=rows; i>0; --i) {
			aa+=(leftClip<<1); // clip the left side
#ifndef WIN32
			planar32bytesUYVY422( aa, yy, uu, vv, wideLoops, EMMS_FLAG);
#else
			planar32bytesUYVY422( (long*)aa, (long*)yy, (long*)uu, (long*)vv, wideLoops, EMMS_FLAG);
#endif
			aa+=(destWidth<<1); // advance the pointers
			yy+=destWidth;
			uu+=(destWidth>>1);
			vv+=(destWidth>>1);

			aa+=(rightClip<<1); // clip the right side
		}
		issueEmms(); // send an Emms instruction to reset the MMX state
	} // end clipping case
	return true;
}

#endif // end mmx case for packedUYVY422_to_planarYUYV422

//
//	packedYUYV422_to_planarYUYV411
//
// This function unpacks every frame into a planar form *and* reduces
//	the color subsampling from 4:2:2 to 4:1:1 by throwing out every
//	other chroma (U,V) sample along each line
// Scalar conversion of a packed YUYV 4:2:2 image (src) into planar form with
// one U and one V sample per four pixels (dest): a destWidth*destHeight Y
// plane followed by U and V planes of a quarter of that size each.  Without
// X86_ASSEMBLER this is the public entry point; with it, the same body is
// compiled under the _nonMMX name.
//
// When the sizes differ the source is centred in (or cropped to) the
// destination.  Border bytes of dest are skipped, not written.
//
// Returns false (after printing a message) when destWidth is not a multiple
// of 4, srcWidth is odd, or the horizontal padding is not a multiple of 4;
// true otherwise.
#ifndef X86_ASSEMBLER
bool packedYUYV422_to_planarYUYV411(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#else
bool packedYUYV422_to_planarYUYV411_nonMMX(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#endif
#ifdef VERIFY_MMX
	printf("non-MMX: packedYUYV422_to_planarYUYV411\n");
#endif

	if( (destWidth&0x3) || (srcWidth&0x1) ) {
		printf("incorrect width in packedYUYV422_to_planarYUYV411_nonMMX\n");
		return false;
	}

	int  a,b;
	char *s = src;
	char *y = dest;
	char *u = y + destWidth * destHeight;
	char *v = u + ((destWidth * destHeight)>>2);

	if(destWidth==srcWidth && destHeight==srcHeight) {
		// packed representation is YUYV YUYV YUYV
		for (a = destHeight; a > 0; a--) {
			// The information we have is 4:2:2. The subsampling consists in
			// keeping the chroma info (U,V) for one pixel pair and throwing
			// it away for the next pair.
			for (b = (destWidth>>2); b > 0; b--) {
				*(y++) = *(s++);
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
				s++;	// drop U of the second pair
				*(y++) = *(s++);
				s++;	// drop V of the second pair
			}
		}
		return true;
	}

	// non-matching size case
	int leftPad=(srcWidth<destWidth)?((destWidth-srcWidth)>>1):0;
	leftPad=((leftPad>>2)<<2); // make a multiple of 4
	int leftClip=(srcWidth>destWidth)?((srcWidth-destWidth)>>1):0;
	// An odd leftClip would start reading in the middle of a Y U Y V group
	// and swap U and V.  srcWidth and destWidth are both even, so rounding
	// down to an even value keeps rightClip even as well.
	leftClip&=~0x1;
	int rightPad=(srcWidth<destWidth)?(destWidth-srcWidth-leftPad):0;
	int rightClip=(srcWidth>destWidth)?(srcWidth-destWidth-leftClip):0;

	int upPad=(srcHeight<destHeight)?((destHeight-srcHeight)>>1):0;
	int upClip=(srcHeight>destHeight)?((srcHeight-destHeight)>>1):0;

	int rows=(srcHeight>destHeight)?destHeight:srcHeight;

	// the chroma pointers move in steps of pad/4, so the pads must divide
	if( (leftPad&0x3) || (rightPad&0x3) ) {
		printf("horz padding must be multiple of 4 in packedYUYV422_to_planarYUYV411\n");
		return false;
	}

	// skip the top border rows of the destination
	y+=(destWidth*upPad);
	u+=((destWidth*upPad)>>2);
	v+=((destWidth*upPad)>>2);

	// skip the rows cropped off the top of the source
	s+=((srcWidth*upClip)<<1);

	if(leftPad!=0 || rightPad!=0) { // if padding necessary

		// packed representation is YUYV YUYV YUYV
		for (a = rows; a > 0; a--) {
			y+=leftPad;
			u+=(leftPad>>2);
			v+=(leftPad>>2);

			for (b = (srcWidth>>2); b > 0; b--) {
				*(y++) = *(s++);
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
				s++;	// drop U of the second pair
				*(y++) = *(s++);
				s++;	// drop V of the second pair
			}
			y+=rightPad;
			u+=(rightPad>>2);
			v+=(rightPad>>2);
		}
	}
	else { // if clipping necessary
		// packed representation is YUYV YUYV YUYV
		for (a = rows; a > 0; a--) {
			s+=(leftClip<<1);

			for (b = (destWidth>>2); b > 0; b--) {
				*(y++) = *(s++);
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
				s++;	// drop U of the second pair
				*(y++) = *(s++);
				s++;	// drop V of the second pair
			}
			s+=(rightClip<<1);
		}
	}
	return true;
}
#ifdef X86_ASSEMBLER

#ifndef WIN32 // non-windows case

// GCC inline-assembly MMX kernel that splits packed 4:2:2 data in byte order
// Y U Y V ... into separate planes, keeping only every other U and V sample.
//
// Each loop iteration reads 32 bytes from src and stores 16 Y bytes at yy,
// 4 U bytes at uu and 4 V bytes at vv, then advances all four pointers.
//
//   loops     - iteration count; it is decremented and tested only after the
//               loop body, so the body runs once even when loops <= 0
//   issueEmms - when true an emms instruction is executed before returning
//
// NOTE(review): the pointers are stored in an array of unsigned int and
// loaded into 32-bit registers, so this presumably targets 32-bit builds only.
// NOTE(review): top_of_loop3 is an ordinary assembler label; if the compiler
// emits this inline body more than once in one translation unit the label
// would be defined twice -- confirm against the GCC extended-asm documentation.
inline void planar32bytesYUYV411(char * src, char* yy, char* uu, char * vv, int loops, bool issueEmms=true)
{
	// mask that keeps the low byte of every 16-bit word
	i64 __volatile__ andArray=0x00ff00ff00ff00ffLL;
	// all arguments are passed to the asm through one array addressed by eax
	unsigned int __volatile__ array[5]={(int)src, int(yy), int(uu), int(vv), loops};
	unsigned int var=(unsigned int)(array);
	// outputs bound to eax/edi; their values are never read
	int __volatile__ dummy1,dummy2;

	__asm__ __volatile__(
		       // save the registers used for the pointers and the counter
		       "push %%ebx\n\t"
		       "push %%ecx\n\t"
		       "push %%edx\n\t"
		       "push %%esi\n\t"

		       // unpack array[]: ebx=yy, ecx=uu, edx=vv, esi=loops, eax=src
		       // (eax is loaded last because it holds the array address)
		       "mov  4(%%eax), %%ebx\n\t"
		       "mov  8(%%eax), %%ecx\n\t"
		       "mov 12(%%eax), %%edx\n\t"
		       "mov 16(%%eax), %%esi\n\t"
		       "mov  0(%%eax), %%eax\n\t"

		       // mm6 = andArray
		       "movq   (%%edi), %%mm6\n\t"
		       "top_of_loop3:\n\t"
		       // load 32 source bytes; each 16-bit word has Y in its low
		       // byte and U or V in its high byte
		       "movq   (%%eax), %%mm0\n\t"
		       "movq  8(%%eax), %%mm1\n\t"
		       "movq 16(%%eax), %%mm2\n\t"
		       "movq 24(%%eax), %%mm3\n\t"

		       // first 8 Y's: mask the copies down to Y, shift the
		       // originals down to chroma, pack and store
		       "movq %%mm0, %%mm4\n\t"
		       "movq %%mm1, %%mm5\n\t"

				"pand %%mm6, %%mm4\n\t"
				"pand %%mm6, %%mm5\n\t"

			   "psrlw $8, %%mm0\n\t"
		       "packuswb %%mm5, %%mm4\n\t"
			   "psrlw $8, %%mm1\n\t"

		       "movq %%mm4, (%%ebx)\n\t"

		       // second 8 Y's, same steps
		       "movq %%mm3, %%mm5\n\t"
		       "movq %%mm2, %%mm4\n\t"

   				"pand %%mm6, %%mm5\n\t"
				"pand %%mm6, %%mm4\n\t"

			   "psrlw $8, %%mm2\n\t"
		       "packuswb %%mm5, %%mm4\n\t"
			   "psrlw $8, %%mm3\n\t"
		       "movq %%mm4, 8(%%ebx)\n\t"

		       // pack the chroma words: mm0 and mm2 each hold U V U V ...
		       "packuswb %%mm1, %%mm0\n\t"
		       "packuswb %%mm3, %%mm2\n\t"

		       // V's: high byte of each chroma pair, shifted down and packed
		       "movq %%mm0, %%mm4\n\t"
		       "movq %%mm2, %%mm5\n\t"

		       "psrlw $8, %%mm4\n\t"
		       "psrlw $8, %%mm5\n\t"

		       // U's: low byte of each chroma pair
		       "pand %%mm6, %%mm0\n\t"

		       "packuswb %%mm5, %%mm4\n\t"
		       "pand %%mm6, %%mm2\n\t"

		       // keep every other V, pack to 4 bytes and store them
		   		"pand %%mm6, %%mm4\n\t"
				"packuswb %%mm4,%%mm4\n\t"
				"movd %%mm4,(%%edx)\n\t"

		       // same reduction for the U's
		       "packuswb %%mm2, %%mm0\n\t"
		       "dec %%esi\n\t"
		   		"pand %%mm6, %%mm0\n\t"
				"packuswb %%mm0,%%mm0\n\t"
				"movd %%mm0,(%%ecx)\n\t"

		       // advance the pointers and loop while the counter is > 0
		       "add $32, %%eax\n\t"
		       "add $16, %%ebx\n\t"
		       "add $4, %%ecx\n\t"
		       "add $4, %%edx\n\t"
		       "cmp $0,%%esi\n\t"
		       "jg top_of_loop3\n\t"
		       // emms is not issued here; it is done conditionally below
		       "pop %%esi\n\t"
		       "pop %%edx\n\t"
		       "pop %%ecx\n\t"
		       "pop %%ebx\n\t"
		       : "=a" (dummy1), "=D" (dummy2)
		       // eax = address of array[], edi = address of the mask
		       : "a" (var) /*(array)*/ , "D" (&andArray)
		       : "memory");
	if(issueEmms) {
			// empty the MMX state
			__asm__ __volatile__(
				"emms\n\t"
				:
				: 
				:"memory");
	}
}

#else // windows case

// MSVC inline-assembly MMX kernel that splits packed 4:2:2 data in byte order
// Y U Y V ... into separate planes, keeping only every other U and V sample.
//
// Each loop iteration reads 32 bytes from data and stores 16 bytes at y,
// 4 bytes at u and 4 bytes at v, then advances all four pointers.
//
//   loops     - number of 32-byte groups to convert; the counter is only
//               tested after the loop body, so one group is processed even
//               when loops <= 0
//   issueEmms - when true, an emms instruction is executed before returning
inline void planar32bytesYUYV411(long * data, long* y, long* u, long* v, int loops, bool issueEmms=true)
{
	// mask that keeps the low byte of every 16-bit word
	__int64 andArray=0x00ff00ff00ff00ff;
	_asm
	{	
		push edi	// edi is used as the loop counter below; save it first
		// Load pointers
		mov eax, [data]		// load ptr to source
		mov ebx, [y]		// location of Y dest
		mov ecx, [u]		// location of U dest
		mov edx, [v]		// location of V dest
		mov edi, [loops]	// load loop iterations
		movq mm6, [andArray]	// Load AND array

top_of_loop:		

		// Dereference pointers to load data into registers
		// (32 source bytes; every 16-bit word holds Y in its low byte and
		// a U or V sample in its high byte)
		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]

		// Extract the first set of y's
		movq mm4, mm0		// Copy data to temp location
		movq mm5, mm1		// Copy data to temp location
		pand mm4, mm6		// Keep low byte of each word (Y), mask U/V away
		pand mm5, mm6		// Keep low byte of each word (Y), mask U/V away
		psrlw mm0, 8		// strip off Y's	<SA>
		packuswb mm4, mm5	// Pack first half of Y's together
		psrlw mm1, 8		// strip off Y's	<SA>
		movq [ebx], mm4		// Save Y's

		// Extract the second set of y's
		movq mm5, mm3		// Copy data to temp location
		movq mm4, mm2		// Copy data to temp location
		pand mm5, mm6		// Keep low byte of each word (Y), mask U/V away
		pand mm4, mm6		// Keep low byte of each word (Y), mask U/V away
		psrlw mm2, 8		// strip off Y's	<SA>
		packuswb mm4, mm5	// Pack second half of Y's together
		psrlw mm3, 8		// strip off Y's	<SA>
		movq [ebx+8], mm4	// Save Y's

		// Combine U's and V's together
		// (each result holds 8 interleaved chroma bytes: U V U V ...)
		packuswb mm0,mm1	// Pack first half together
		packuswb mm2,mm3	// Pack second half together

		// Combine and save V's
		movq mm4, mm0		// Make a copy
		movq mm5, mm2		// Make a copy
		psrlw mm4, 8		// Shift down and strip off U's
		psrlw mm5, 8		// Shift down and strip off U's
		pand mm0, mm6		// strip off V's	<SA>
		packuswb mm4,mm5	// Pack Together
		pand mm2, mm6		// strip off V's <SA>

		pand mm4, mm6		// Mask out every other V (keeps half of them)
		packuswb mm4,mm4	// Pack Together the remaining v's
		movd [edx], mm4		// Save (4 bytes)

		// Combine and save U's
		packuswb mm0,mm2	// Pack together
		dec edi				// Decrement loop count <SA>
		pand mm0, mm6		// Mask out every other U (keeps half of them)
		packuswb mm0,mm0	// Pack Together the remaining u's
		movd [ecx], mm0		// Save (4 bytes)

		add eax, 32		// Advance pointer
		add ebx, 16		// Advance pointer

		add ecx, 4		// Advance pointer <SA>
		add edx, 4		// Advance pointer <SA>
		cmp edi,0		// Perform test
		jg top_of_loop	// jump back to top for another loop
		// emms is not issued here; it is done conditionally below
		pop edi			// restore the register
	}
	// When the caller asked for no emms, return now; otherwise run the plain
	// block that follows (the braces are not attached to the if).
	if(!issueEmms) return;{
		_asm
		{	
			emms            // empty MMX state
		}
	}
}
#endif // end of windows case

// packedYUYV422_to_planarYUYV411 (MMX build)
//
// Converts a packed YUYV 4:2:2 image in src into planar form in dest with one
// U and one V sample per four pixels: a destWidth*destHeight Y plane followed
// by a U plane and a V plane of a quarter of that size each.  When the sizes
// differ the source is centred: a smaller source leaves the border bytes of
// dest untouched (they are skipped, not cleared) and a larger source is
// cropped.
//
// The work is handed to packedYUYV422_to_planarYUYV411_nonMMX() when MMX is
// not supported, when srcWidth is odd, when destWidth is not a multiple of 4,
// when destWidth*rows is not a multiple of 16, or when the widths differ and
// either one is not a multiple of 16.
//
// Returns true on the MMX paths, otherwise the result of the fallback.
bool packedYUYV422_to_planarYUYV411(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#ifdef VERIFY_MMX
	printf("MMX: packedYUYV422_to_planarYUYV411\n");
#endif
	if(!supportsMMX()) {
		return packedYUYV422_to_planarYUYV411_nonMMX(dest,destWidth,destHeight,
				src,srcWidth,srcHeight);
	}

	// only the rows present in both images are converted
	int rows=(destHeight<srcHeight)?destHeight:srcHeight;
	if( (srcWidth&0x1) || (destWidth&0x3) || ((destWidth*rows)&0xf) ) {
#ifdef VERIFY_MMX
		printf("size not 2/4/16 in video-v4l.cc packedYUYV422_to_planar411 MMX\n");
#endif
		return packedYUYV422_to_planarYUYV411_nonMMX(dest,destWidth,destHeight,
									  src, srcWidth, srcHeight);
	}

	char * aa=src;
	char * yy=dest;
	char * uu=dest+destWidth*destHeight;
	char * vv=uu+((destWidth*destHeight)>>2);
	if(destHeight>srcHeight) {
		// shorter source: skip the top border rows in all three planes
		int difference=((destHeight-srcHeight)>>1);
		yy+=difference*destWidth;
		uu+=((difference*destWidth)>>2);
		vv+=((difference*destWidth)>>2);
	}
	else if(destHeight<srcHeight) {
		// taller source: skip the rows cropped off the top (2 bytes per pixel)
		int difference=((srcHeight-destHeight)>>1);
		aa+=((difference*srcWidth)<<1);
	}

	if(destWidth==srcWidth) {
		// Rows are contiguous in both buffers, so one kernel call converts
		// them all; with the default issueEmms argument the kernel executes
		// emms itself.
		int loops=((destWidth * rows)>>4);
		// The kernel tests its counter only after the loop body, so it would
		// still convert 16 pixels for an empty image; skip it in that case.
		if(loops>0) {
#ifndef WIN32
			planar32bytesYUYV411( aa, yy, uu, vv, loops);
#else
			planar32bytesYUYV411( (long*)aa, (long*)yy, (long*)uu, (long*)vv, loops);
#endif
		}
		return true;
	}

	// Differing widths are converted row by row, 16 pixels per kernel
	// iteration, which needs both widths to be multiples of 16.
	if( (destWidth&0xf) || (srcWidth&0xf) ) {
#ifdef VERIFY_MMX
		printf("width mismatch not 16 in video-v4l.cc packedYUYV422_to_planar411 MMX\n");
#endif
		return packedYUYV422_to_planarYUYV411_nonMMX(dest,destWidth,destHeight,
				  src,srcWidth,srcHeight);
	}

	int leftPad=(srcWidth<destWidth)?((destWidth-srcWidth)>>1):0;
	int leftClip=(srcWidth>destWidth)?((srcWidth-destWidth)>>1):0;
	// round down to a multiple of 4 pixels (with both widths multiples of 16
	// the values already are)
	leftPad&=~(0x3);
	leftClip&=~(0x3);
	int rightPad=(srcWidth<destWidth)?(destWidth-srcWidth-leftPad):0;
	int rightClip=(srcWidth>destWidth)?(srcWidth-destWidth-leftClip):0;

	if(leftPad!=0 || rightPad!=0) {
		int wideLoops=srcWidth>>4; // divide by 16 pixels
		if(wideLoops<=0) {
			// empty source rows; the kernel must not run (see above)
			return true;
		}
		for(int i=rows; i>0; --i) {
			yy+=leftPad; // pad the left side
			uu+=(leftPad>>2);
			vv+=(leftPad>>2);
#ifndef WIN32
			planar32bytesYUYV411( aa, yy, uu, vv, wideLoops, EMMS_FLAG);
#else
			planar32bytesYUYV411( (long*)aa, (long*)yy, (long*)uu, (long*)vv, wideLoops, EMMS_FLAG);
#endif
			aa+=(srcWidth<<1); // advance the pointers
			yy+=srcWidth;
			uu+=(srcWidth>>2);
			vv+=(srcWidth>>2);

			yy+=rightPad; // pad the right side
			uu+=(rightPad>>2);
			vv+=(rightPad>>2);
		}
		issueEmms(); // send an Emms instruction to reset the MMX state
	}
	else { // clipping !=0;
		int wideLoops=destWidth>>4; // divide by 16 pixels
		if(wideLoops<=0) {
			// empty destination rows; the kernel must not run (see above)
			return true;
		}
		for(int i=rows; i>0; --i) {
			aa+=(leftClip<<1); // clip the left side
#ifndef WIN32
			planar32bytesYUYV411( aa, yy, uu, vv, wideLoops, EMMS_FLAG);
#else
			planar32bytesYUYV411( (long*)aa, (long*)yy, (long*)uu, (long*)vv, wideLoops, EMMS_FLAG);
#endif
			aa+=(destWidth<<1); // advance the pointers
			yy+=destWidth;
			uu+=(destWidth>>2);
			vv+=(destWidth>>2);

			aa+=(rightClip<<1); // clip the right side
		}
		issueEmms(); // send an Emms instruction to reset the MMX state
	} // end clipping case
	return true;
}

#endif // end of MMX case for packedYUYV422_to_planarYUYV411

// Scalar conversion of a packed UYVY 4:2:2 image (src) into planar form with
// one U and one V sample per four pixels (dest): a destWidth*destHeight Y
// plane followed by U and V planes of a quarter of that size each.  Every
// other chroma sample along a line is dropped.  Without X86_ASSEMBLER this is
// the public entry point; with it, the same body is compiled under the
// _nonMMX name.
//
// When the sizes differ the source is centred in (or cropped to) the
// destination.  Border bytes of dest are skipped, not written.
//
// Returns false (after printing a message) when destWidth is not a multiple
// of 4, srcWidth is odd, or the horizontal padding is not a multiple of 4;
// true otherwise.
#ifndef X86_ASSEMBLER
bool packedUYVY422_to_planarYUYV411(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#else
bool packedUYVY422_to_planarYUYV411_nonMMX(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#endif
#ifdef VERIFY_MMX
	printf("non-MMX: packedUYVY422_to_planarYUYV411\n");
#endif
	if( (destWidth&0x3) || (srcWidth&0x1) ) {
		printf("wrong width in packedUYVY422_to_planarYUYV411_nonMMX\n");
		return false;
	}

	int  a,b;
	char *s = src;
	char *y = dest;
	char *u = y + destWidth * destHeight;
	char *v = u + ((destWidth * destHeight)>>2);

	if(destWidth==srcWidth && destHeight==srcHeight) {
		// packed representation is UYVY UYVY UYVY
		for (a = destHeight; a > 0; a--) {
			for (b = (destWidth>>2); b > 0; b--) {
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
				s++;	// drop U of the second pair
				*(y++) = *(s++);
				s++;	// drop V of the second pair
				*(y++) = *(s++);
			}
		}
		return true;
	}

	// non-matching size case
	int leftPad=(srcWidth<destWidth)?((destWidth-srcWidth)>>1):0;
	leftPad=((leftPad>>2)<<2); // make a multiple of 4
	int leftClip=(srcWidth>destWidth)?((srcWidth-destWidth)>>1):0;
	// An odd leftClip would start reading in the middle of a U Y V Y group
	// and swap U and V.  srcWidth and destWidth are both even, so rounding
	// down to an even value keeps rightClip even as well.
	leftClip&=~0x1;
	int rightPad=(srcWidth<destWidth)?(destWidth-srcWidth-leftPad):0;
	int rightClip=(srcWidth>destWidth)?(srcWidth-destWidth-leftClip):0;

	int upPad=(srcHeight<destHeight)?((destHeight-srcHeight)>>1):0;
	int upClip=(srcHeight>destHeight)?((srcHeight-destHeight)>>1):0;

	int rows=(srcHeight>destHeight)?destHeight:srcHeight;

	// the chroma pointers move in steps of pad/4, so the pads must divide
	if( (leftPad&0x3) || (rightPad&0x3) ) {
		printf("horz padding must be multiple of 4 in packedUYVY422_to_planarYUYV411\n");
		return false;
	}

	// skip the top border rows of the destination
	y+=(destWidth*upPad);
	u+=((destWidth*upPad)>>2);
	v+=((destWidth*upPad)>>2);

	// skip the rows cropped off the top of the source
	s+=((srcWidth*upClip)<<1);

	if(leftPad!=0 || rightPad!=0) { // if padding necessary

		// packed representation is UYVY UYVY UYVY
		for (a = rows; a > 0; a--) {
			y+=leftPad;
			u+=(leftPad>>2);
			v+=(leftPad>>2);

			// The information we have is 4:2:2. The subsampling consists in
			// keeping the chroma info (U,V) for one pixel pair and throwing
			// it away for the next pair.
			for (b = (srcWidth>>2); b > 0; b--) {
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
				s++;	// drop U of the second pair
				*(y++) = *(s++);
				s++;	// drop V of the second pair
				*(y++) = *(s++);
			}
			y+=rightPad;
			u+=(rightPad>>2);
			v+=(rightPad>>2);
		}
	}
	else { // if clipping necessary
		// packed representation is UYVY UYVY UYVY
		for (a = rows; a > 0; a -= 1) {
			s+=(leftClip<<1);

			for (b = (destWidth>>2); b > 0; b--) {
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
				s++;	// drop U of the second pair
				*(y++) = *(s++);
				s++;	// drop V of the second pair
				*(y++) = *(s++);
			}
			s+=(rightClip<<1);
		}
	}
	return true;
}
#ifdef X86_ASSEMBLER

#ifndef WIN32 // non-windows case

/*
 * planar32bytesUYVY411 (GCC inline-asm flavour, 32-bit x86 with MMX)
 *
 * Splits packed UYVY 4:2:2 data into separate Y, U and V streams while
 * halving the horizontal chroma resolution again (4:2:2 -> 4:1:1).
 *
 * Each loop iteration consumes 32 source bytes (16 pixels) and stores
 * 16 bytes at yy, 4 bytes at uu and 4 bytes at vv.
 *
 *   src       packed UYVY input
 *   yy/uu/vv  destinations for the Y, U and V samples
 *   loops     number of 16-pixel groups to convert; the loop test is at the
 *             bottom, so the body runs once even when loops <= 0
 *   issueEmms when true an "emms" is executed after the loop
 *
 * The loop label is a numeric local label ("1:" / "1b") rather than a
 * fixed name: this function is inline and called from several places, and
 * a fixed label would be emitted once per inlined copy, which the
 * assembler rejects as a duplicate symbol.
 */
inline void planar32bytesUYVY411(char * src, char* yy, char* uu, char * vv, int loops, bool issueEmms=true)
{
#ifdef VERIFY_MMX
	printf("MMX: planar32bytesUYVY411\n");
#endif
	// mask that keeps the low byte of every 16-bit word
	i64 __volatile__ andArray=0x00ff00ff00ff00ffLL;
	// all arguments are handed to the asm through one array addressed by eax
	unsigned int __volatile__ array[5]={(int)src, int(yy), int(uu), int(vv), loops};
	unsigned int var=(unsigned int)(array);
	int __volatile__ dummy1,dummy2;

	__asm__ __volatile__(
		       // save the registers used as scratch below
		       "push %%ebx\n\t"
		       "push %%ecx\n\t"
		       "push %%edx\n\t"
		       "push %%esi\n\t"

		       // ebx=yy, ecx=uu, edx=vv, esi=loops, eax=src (loaded last,
		       // because eax is the base of the argument array)
		       "mov  4(%%eax), %%ebx\n\t"
		       "mov  8(%%eax), %%ecx\n\t"
		       "mov 12(%%eax), %%edx\n\t"
		       "mov 16(%%eax), %%esi\n\t"
		       "mov  0(%%eax), %%eax\n\t"

		       // mm6 = low-byte mask (edi points at andArray)
		       "movq   (%%edi), %%mm6\n\t"
		       "1:\n\t"
		       // load 32 source bytes
		       "movq   (%%eax), %%mm0\n\t"
		       "movq  8(%%eax), %%mm1\n\t"
		       "movq 16(%%eax), %%mm2\n\t"
		       "movq 24(%%eax), %%mm3\n\t"

		       // first 8 Y's: high byte of every word of mm0/mm1
		       "movq %%mm0, %%mm4\n\t"
		       "movq %%mm1, %%mm5\n\t"

		       "psrlw $8, %%mm4\n\t"
		       "psrlw $8, %%mm5\n\t"

		       "pand %%mm6, %%mm0\n\t"	// keep chroma bytes only
		       "packuswb %%mm5, %%mm4\n\t"
		       "pand %%mm6, %%mm1\n\t"	// keep chroma bytes only
		       "movq %%mm4, (%%ebx)\n\t"

		       // second 8 Y's: high byte of every word of mm2/mm3
		       "movq %%mm3, %%mm5\n\t"
		       "movq %%mm2, %%mm4\n\t"

		       "psrlw $8, %%mm5\n\t"
		       "psrlw $8, %%mm4\n\t"

		       "pand %%mm6, %%mm2\n\t"	// keep chroma bytes only
		       "packuswb %%mm5, %%mm4\n\t"
		       "pand %%mm6, %%mm3\n\t"	// keep chroma bytes only
		       "movq %%mm4, 8(%%ebx)\n\t"

		       // mm0 / mm2 now hold interleaved U,V bytes
		       "packuswb %%mm1, %%mm0\n\t"
		       "packuswb %%mm3, %%mm2\n\t"

		       // V's: high byte of each U,V word
		       "movq %%mm0, %%mm4\n\t"
		       "movq %%mm2, %%mm5\n\t"

		       "psrlw $8, %%mm4\n\t"
		       "psrlw $8, %%mm5\n\t"

		       "pand %%mm6, %%mm0\n\t"	// U's: low byte of each U,V word

		       "packuswb %%mm5, %%mm4\n\t"
		       "pand %%mm6, %%mm2\n\t"	// U's: low byte of each U,V word

		       // drop every other V and store the remaining 4
		       "pand %%mm6, %%mm4\n\t"
		       "packuswb %%mm4,%%mm4\n\t"
		       "movd %%mm4,(%%edx)\n\t"

		       // drop every other U and store the remaining 4
		       "packuswb %%mm2, %%mm0\n\t"
		       "dec %%esi\n\t"
		       "pand %%mm6, %%mm0\n\t"
		       "packuswb %%mm0,%%mm0\n\t"
		       "movd %%mm0,(%%ecx)\n\t"

		       // advance all pointers and loop while the counter is > 0
		       "add $32, %%eax\n\t"
		       "add $16, %%ebx\n\t"
		       "add $4, %%ecx\n\t"
		       "add $4, %%edx\n\t"
		       "cmp $0,%%esi\n\t"
		       "jg 1b\n\t"
		       "pop %%esi\n\t"
		       "pop %%edx\n\t"
		       "pop %%ecx\n\t"
		       "pop %%ebx\n\t"
		       : "=a" (dummy1), "=D" (dummy2)
		       : "a" (var) /*(array)*/ , "D" (&andArray)
		       : "memory");
	// emms is issued separately so callers can skip it between rows
	if(issueEmms) {
			__asm__ __volatile__(
				"emms\n\t"
				:
				:
				:"memory");
	}
}

#else // windows case

/*
 * planar32bytesUYVY411 (MSVC inline-asm flavour, 32-bit x86 with MMX)
 *
 * Splits packed UYVY 4:2:2 data into separate Y, U and V streams while
 * keeping only every other chroma sample (4:2:2 -> 4:1:1).
 *
 * Each loop iteration reads 32 bytes from data and stores 16 bytes at y,
 * 4 bytes at u and 4 bytes at v.
 *
 *   data      packed UYVY input
 *   y/u/v     destinations for the Y, U and V samples
 *   loops     number of 32-byte groups to convert; the loop test is at the
 *             bottom, so the body runs once even when loops <= 0
 *   issueEmms when true an emms is executed after the loop
 */
inline void planar32bytesUYVY411(long * data, long* y, long* u, long* v, int loops, bool issueEmms=true)
{
	// mask that keeps the low byte of every 16-bit word
	__int64 andArray=0x00ff00ff00ff00ff;
	_asm
	{	
		push edi	// push a register we really shouldn't mess with
		// Load pointers
		mov eax, [data]		// load ptr to source
		mov ebx, [y]		// location of Y dest
		mov ecx, [u]		// location of U dest
		mov edx, [v]		// location of V dest
		mov edi, [loops]	// load loop iterations
		movq mm6, [andArray]	// Load AND array

top_of_loop:		

		// Dereference pointers to load data into registers
		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]

		// Extract the first set of y's
		movq mm4, mm0		// Copy data to temp location
		movq mm5, mm1		// Copy data to temp location
		psrlw mm4, 8		// Get high bits (Y) & shift U/V away
		psrlw mm5, 8		// Get high bits (Y) & shift U/V away
		pand mm0, mm6		// strip off Y's	<SA>
		packuswb mm4, mm5	// Pack first half of Y's together
		pand mm1, mm6		// strip off Y's	<SA>
		movq [ebx], mm4		// Save Y's

		// Extract the second set of y's
		movq mm5, mm3		// Copy data to temp location
		movq mm4, mm2		// Copy data to temp location
		psrlw mm5, 8		// Get high bits (Y) & shift U/V away
		psrlw mm4, 8		// Get high bits (Y) & shift U/V away
		pand mm2, mm6		// strip off Y's	<SA>
		packuswb mm4, mm5	// Pack second half of Y's together
		pand mm3, mm6		// strip off Y's	<SA>
		movq [ebx+8], mm4	// Save Y's

		// Combine U's and V's together (bytes alternate U,V after packing)
		packuswb mm0,mm1	// Pack first half together
		packuswb mm2,mm3	// Pack second half together

		// Combine and save V's
		movq mm4, mm0		// Make a copy
		movq mm5, mm2		// Make a copy
		psrlw mm4, 8		// Shift down and strip off U's
		psrlw mm5, 8		// Shift down and strip off U's
		pand mm0, mm6		// strip off V's	<SA>
		packuswb mm4,mm5	// Pack Together
		pand mm2, mm6		// strip off V's <SA>

		pand mm4, mm6		// Mask out every other v (keep half of them)
		packuswb mm4,mm4	// Pack Together the remaining v's
		movd [edx], mm4		// Save 4 V bytes

		// Combine and save U's
		packuswb mm0,mm2	// Pack together
		dec edi				// Decrement loop count <SA>
		pand mm0, mm6		// Mask out every other u (keep half of them)
		packuswb mm0,mm0	// Pack Together the remaining u's
		movd [ecx], mm0		// Save 4 U bytes

		add eax, 32		// Advance pointer
		add ebx, 16		// Advance pointer

		add ecx, 4		// Advance pointer <SA>
		add edx, 4		// Advance pointer <SA>
		cmp edi,0		// Perform test
		jg top_of_loop	// jump back to top for another loop
//        emms is not issued here; see the conditional block after the asm
		pop edi			// restore the register
	}
	// Early return when the caller does not want emms; the braces that
	// follow are a plain block that only runs when issueEmms is true.
	if(!issueEmms) return;{
		_asm
		{	
			emms            // empty MMX state
		}
	}
}

#endif

/*
 * packedUYVY422_to_planarYUYV411 (MMX entry point)
 *
 * Converts a packed UYVY 4:2:2 frame into planar Y/U/V 4:1:1.  The
 * destination holds destWidth*destHeight Y bytes followed by two chroma
 * planes of (destWidth*destHeight)>>2 bytes each.
 *
 * When the sizes differ the source is centred in the destination: surplus
 * source rows/columns are skipped, and destination rows/columns that the
 * source does not reach are stepped over without being written.
 *
 * The work is handed to packedUYVY422_to_planarYUYV411_nonMMX when MMX is
 * not available or the dimensions do not meet the alignment tested below.
 * Returns true on the MMX path, otherwise the fallback's result.
 */
bool packedUYVY422_to_planarYUYV411(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#ifdef VERIFY_MMX
	printf("MMX: packedUYVY422_to_planarYUYV411\n");
#endif
	if(!supportsMMX()) {
		return packedUYVY422_to_planarYUYV411_nonMMX(dest, destWidth, destHeight,
							  src, srcWidth, srcHeight);
	}

	// number of rows that are actually converted
	int rows = srcHeight;
	if(destHeight < srcHeight) {
		rows = destHeight;
	}

	if( (srcWidth&0x1) || (destWidth&0x3) || (destWidth*rows)&0xf ) {
#ifdef VERIFY_MMX
		printf("size not 2/4/16 in video-v4l.cc packedUYVY422_to_planar411 MMX\n");
#endif
		return packedUYVY422_to_planarYUYV411_nonMMX(dest, destWidth, destHeight,
							  src, srcWidth, srcHeight);
	}

	// read cursor and the three write cursors
	char *in   = src;
	char *outY = dest;
	char *outU = dest + destWidth*destHeight;
	char *outV = outU + ((destWidth*destHeight)>>2);

	// vertical centring: skip destination rows or drop source rows
	if(destHeight > srcHeight) {
		const int skippedRows = (destHeight-srcHeight)>>1;
		outY += skippedRows*destWidth;
		outU += (skippedRows*destWidth)>>2;
		outV += (skippedRows*destWidth)>>2;
	}
	else if(destHeight < srcHeight) {
		const int droppedRows = (srcHeight-destHeight)>>1;
		in += (droppedRows*srcWidth)<<1;
	}

	// equal widths: the rows are contiguous, convert them in one call
	if(destWidth == srcWidth) {
		const int groups = (destWidth*rows)>>4;
#ifndef WIN32
		planar32bytesUYVY411( in, outY, outU, outV, groups);
#else
		planar32bytesUYVY411( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups);
#endif
		return true;
	}

	// differing widths are handled row by row and need 16-pixel alignment
	if( (destWidth&0xf) || (srcWidth&0xf) ) {
#ifdef VERIFY_MMX
		printf("width mismatch not 16 in video-v4l.cc packedUYVY422_to_planar411 MMX\n");
#endif
		return packedUYVY422_to_planarYUYV411_nonMMX(dest, destWidth, destHeight,
				  src, srcWidth, srcHeight);
	}

	// horizontal centring; the left amounts are rounded down to a multiple of 4
	int padLeft = 0, padRight = 0, clipLeft = 0, clipRight = 0;
	if(srcWidth < destWidth) {
		padLeft  = ((destWidth-srcWidth)>>1) & ~(0x3);
		padRight = destWidth-srcWidth-padLeft;
	}
	else if(srcWidth > destWidth) {
		clipLeft  = ((srcWidth-destWidth)>>1) & ~(0x3);
		clipRight = srcWidth-destWidth-clipLeft;
	}

	if(padLeft!=0 || padRight!=0) {
		// source narrower than destination: one source row per call
		const int groups = srcWidth>>4;
		for(int row=0; row<rows; ++row) {
			outY += padLeft;
			outU += (padLeft>>2);
			outV += (padLeft>>2);
#ifndef WIN32
			planar32bytesUYVY411( in, outY, outU, outV, groups, EMMS_FLAG);
#else
			planar32bytesUYVY411( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups, EMMS_FLAG);
#endif
			// step over the row just written plus the right-hand padding
			in   += (srcWidth<<1);
			outY += srcWidth + padRight;
			outU += (srcWidth>>2) + (padRight>>2);
			outV += (srcWidth>>2) + (padRight>>2);
		}
	}
	else {
		// source wider than destination: one destination row per call
		const int groups = destWidth>>4;
		for(int row=0; row<rows; ++row) {
			in += (clipLeft<<1);
#ifndef WIN32
			planar32bytesUYVY411( in, outY, outU, outV, groups, EMMS_FLAG);
#else
			planar32bytesUYVY411( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups, EMMS_FLAG);
#endif
			// step over the converted pixels plus the right-hand clip
			in   += (destWidth<<1) + (clipRight<<1);
			outY += destWidth;
			outU += (destWidth>>2);
			outV += (destWidth>>2);
		}
	}
	issueEmms(); // send an Emms instruction to reset the MMX state
	return true;
}

#endif // end of mmx case for packedUYVY422_to_planarYUYV411

//
//	packedYUYV422_to_planarYUYV420
//
// Unpacks a packed YUYV 4:2:2 frame (bytes Y U Y V per pixel pair) into
// planar form *and* reduces the color subsampling from 4:2:2 to 4:2:0 by
// keeping the chroma of the first line of every line pair and throwing out
// the chroma information of the second line.
//
// dest receives destWidth*destHeight Y bytes, followed by two chroma planes
// of (destWidth*destHeight)>>2 bytes each (U first, then V).
//
// When source and destination sizes differ the source is centred in the
// destination: surplus source rows/columns are clipped, and destination
// rows/columns the source does not reach are stepped over without being
// written.  The left pad / left clip amounts are rounded down to an even
// pixel count: an odd left pad makes every chroma row one byte short so
// the U/V rows drift, and an odd left clip starts reading in the middle of
// a 4-byte Y U Y V group so U and V are swapped.
//
// Returns false (after printing a message) when any dimension is odd,
// true otherwise.
#ifndef X86_ASSEMBLER
bool packedYUYV422_to_planarYUYV420(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#else
bool packedYUYV422_to_planarYUYV420_nonMMX(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#endif
#ifdef VERIFY_MMX
	printf("non-MMX: packedYUYV422_to_planarYUYV420\n");
#endif
	if( (destHeight&0x1) || (srcHeight&0x1) ) {
		printf("even height required in packedYUYV422_to_planarYUYV420_nonMMX\n");
		return false;
	}
	if( (destWidth&0x1) || (srcWidth&0x1) ) {
		printf("even width required in packedYUYV422_to_planarYUYV420_nonMMX\n");
		return false;
	}

	int  a,b;
	char *s = src;
	char *y = dest;
	char *u = y + destWidth * destHeight;
	char *v = u + ((destWidth * destHeight)>>2);

	if(destWidth==srcWidth && destHeight==srcHeight) {
		// packed representation is YUYV YUYV YUYV
		for (a = (destHeight>>1); a > 0; a--) {
			// first line of the pair: keep luma and chroma
			for (b = (destWidth>>1); b > 0; b--) {
				*(y++) = *(s++);
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
			}
			// second line of the pair: keep luma, skip chroma
			for (b = (destWidth>>1); b > 0; b--) {
				*(y++) = *(s++);
				s++;
				*(y++) = *(s++);
				s++;
			}
		}
		return true;
	}

	////////////////////////////////////////////////////////
	// non-matching size case

	int leftPad=(srcWidth<destWidth)?((destWidth-srcWidth)>>1):0;
	int leftClip=(srcWidth>destWidth)?((srcWidth-destWidth)>>1):0;
	// chroma is stored once per pixel pair, so horizontal offsets must be even
	leftPad&=~(0x1);
	leftClip&=~(0x1);
	int rightPad=(srcWidth<destWidth)?(destWidth-srcWidth-leftPad):0;
	int rightClip=(srcWidth>destWidth)?(srcWidth-destWidth-leftClip):0;

	int upPad=(srcHeight<destHeight)?((destHeight-srcHeight)>>1):0;
	if(upPad&0x1) { --upPad; }  // 4:2:0, so deal with even #'ed rows
	int upClip=(srcHeight>destHeight)?((srcHeight-destHeight)>>1):0;

	int rows=(srcHeight>destHeight)?destHeight:srcHeight;

	// handle the y up padding
	y+=(destWidth*upPad);
	u+=((destWidth*upPad)>>2);
	v+=((destWidth*upPad)>>2);

	// handle the y up clipping
	s+=((srcWidth*upClip)<<1);

	if(leftPad!=0 || rightPad!=0) { // if padding necessary
		// packed representation is YUYV YUYV YUYV
		for (a = (rows>>1); a > 0; a--) {
			y+=leftPad;
			u+=(leftPad>>1);
			v+=(leftPad>>1);
			// first line of the pair: keep luma and chroma
			for (b = (srcWidth>>1); b > 0; b--) {
				*(y++) = *(s++);
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
			}
			y+=rightPad;
			u+=(rightPad>>1);
			v+=(rightPad>>1);
			y+=leftPad;
			// second line of the pair: keep luma, skip chroma
			for (b = (srcWidth>>1); b > 0; b--) {
				*(y++) = *(s++);
				s++;
				*(y++) = *(s++);
				s++;
			}
			y+=rightPad;
		}
	}
	else { // clipping is necessary (or only the heights differ)
		// packed representation is YUYV YUYV YUYV
		for (a = (rows>>1); a > 0; a--) {
			s+=(leftClip<<1);
			// first line of the pair: keep luma and chroma
			for (b = (destWidth>>1); b > 0; b--) {
				*(y++) = *(s++);
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
			}
			s+=(rightClip<<1);
			s+=(leftClip<<1);
			// second line of the pair: keep luma, skip chroma
			for (b = (destWidth>>1); b > 0; b--) {
				*(y++) = *(s++);
				s++;
				*(y++) = *(s++);
				s++;
			}
			s+=(rightClip<<1);
		}
	}
	return true;
}
#ifdef X86_ASSEMBLER

#ifndef WIN32

/*
 * planar32bytesYUYV (GCC inline-asm flavour, 32-bit x86 with MMX)
 *
 * Just extracts the y's: copies the low byte of every 16-bit word of
 * packed YUYV data to yy and discards the chroma bytes.
 *
 * Each loop iteration consumes 32 source bytes and stores 16 bytes at yy.
 *
 *   src       packed YUYV input
 *   yy        destination for the Y samples
 *   loops     number of 32-byte groups to convert; the loop test is at the
 *             bottom, so the body runs once even when loops <= 0
 *   issueEmms when true an "emms" is executed after the loop
 *
 * The loop label is a numeric local label ("1:" / "1b") rather than a
 * fixed name: this function is inline and called from several places, and
 * a fixed label would be emitted once per inlined copy, which the
 * assembler rejects as a duplicate symbol.
 */
inline void planar32bytesYUYV(char * src, char* yy, int loops, bool issueEmms=true)
{
	// mask that keeps the low byte of every 16-bit word
	i64 __volatile__ andArray=0x00ff00ff00ff00ffLL;
	// all arguments are handed to the asm through one array addressed by eax
	unsigned int __volatile__ array[3]={(int)src, int(yy), loops};
	unsigned int var=(unsigned int)(array);
	int __volatile__ dummy1,dummy2;

	__asm__ __volatile__(
		       // save the registers used as scratch below
		       "push %%ebx\n\t"
		       "push %%ecx\n\t"
		       "push %%edx\n\t"
		       "push %%esi\n\t"

		       // ebx=yy, esi=loops, eax=src (loaded last, because eax is
		       // the base of the argument array)
		       "mov  4(%%eax), %%ebx\n\t"
		       "mov  8(%%eax), %%esi\n\t"
		       "mov  0(%%eax), %%eax\n\t"

		       // mm6 = low-byte mask (edi points at andArray)
		       "movq   (%%edi), %%mm6\n\t"
		       "1:\n\t"

		       // load 32 source bytes
		       "movq   (%%eax), %%mm0\n\t"
		       "movq  8(%%eax), %%mm1\n\t"
		       "movq 16(%%eax), %%mm2\n\t"
		       "movq 24(%%eax), %%mm3\n\t"

		       // keep the low (Y) byte of every word
		       "pand %%mm6, %%mm0\n\t"
		       "pand %%mm6, %%mm1\n\t"
		       "pand %%mm6, %%mm2\n\t"
		       "pand %%mm6, %%mm3\n\t"

		       // pack to bytes and store 2 x 8 Y's
		       "packuswb %%mm1, %%mm0\n\t"
		       "movq %%mm0, (%%ebx)\n\t"

		       "packuswb %%mm3, %%mm2\n\t"
		       "movq %%mm2, 8(%%ebx)\n\t"
		       // advance and loop while the counter is > 0
		       "dec %%esi\n\t"
		       "add $32, %%eax\n\t"
		       "add $16, %%ebx\n\t"
		       "cmp $0,%%esi\n\t"
		       "jg 1b\n\t"
		       "pop %%esi\n\t"
		       "pop %%edx\n\t"
		       "pop %%ecx\n\t"
		       "pop %%ebx\n\t"
		       : "=a" (dummy1), "=D" (dummy2)
		       : "a" (var) /*(array)*/ , "D" (&andArray)
		       : "memory");
	// emms is issued separately so callers can skip it between rows
	if(issueEmms) {
			__asm__ __volatile__(
				"emms\n\t"
				:
				:
				:"memory");
	}
}

#else // windows case

/*
 * planar32bytesYUYV (MSVC inline-asm flavour, 32-bit x86 with MMX)
 *
 * Just extracts the y's: copies the low byte of every 16-bit word of the
 * packed input to y and discards the other byte.
 *
 * Each loop iteration reads 32 bytes from data and stores 16 bytes at y.
 *
 *   data      packed YUYV input
 *   y         destination for the Y samples
 *   loops     number of 32-byte groups to convert; the loop test is at the
 *             bottom, so the body runs once even when loops <= 0
 *   issueEmms when true an emms is executed after the loop
 */
inline void planar32bytesYUYV(long * data, long* y, int loops, bool issueEmms=true)
{
	// mask that keeps the low byte of every 16-bit word
	__int64 andArray=0x00ff00ff00ff00ff;
	_asm
	{	
		push edi	// push a register we really shouldn't mess with
		// Load pointers
		mov eax, [data]		// load ptr to source
		mov ebx, [y]		// location of Y dest
		mov edi, [loops]	// load loop iterations
		movq mm6, [andArray]	// Load AND array

top_of_loop:		

		// Dereference pointers to load data into registers
		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]

		// Keep only the low byte (Y) of every word
		pand mm0, mm6		// strip off U/V's	<SA>
		pand mm1, mm6		// strip off U/V's	<SA>
		pand mm2, mm6		// strip off U/V's	<SA>
		pand mm3, mm6		// strip off U/V's	<SA>

		packuswb mm0, mm1	// Pack first half of Y's together
		movq [ebx], mm0		// Save Y's
		packuswb mm2, mm3	// Pack second half of Y's together
		movq [ebx+8], mm2		// Save Y's
		dec edi				// Decrement loop count <SA>
		add eax, 32		// Advance pointer
		add ebx, 16		// Advance pointer
		cmp edi,0		// Perform test
		jg top_of_loop	// jump back to top for another loop
     //   emms is not issued here; see the conditional block after the asm
		pop edi			// restore the register
	}
	// Early return when the caller does not want emms; the braces that
	// follow are a plain block that only runs when issueEmms is true.
	if(!issueEmms) return;{
		_asm
		{	
			emms            // empty MMX state
		}
	}
}

#endif // end windows case

/*
 * packedYUYV422_to_planarYUYV420 (MMX entry point)
 *
 * Converts a packed YUYV 4:2:2 frame into planar Y/U/V 4:2:0.  Rows are
 * processed in pairs: the first row of a pair goes through
 * planar32bytesYUYV422 (luma and chroma), the second through
 * planar32bytesYUYV (luma only).
 *
 * When the sizes differ the source is centred in the destination: surplus
 * source rows/columns are skipped, and destination rows/columns that the
 * source does not reach are stepped over without being written.
 *
 * The work is handed to packedYUYV422_to_planarYUYV420_nonMMX when MMX is
 * not available, a width is not a multiple of 16, or a height is odd.
 * Returns true on the MMX path, otherwise the fallback's result.
 */
bool packedYUYV422_to_planarYUYV420(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#ifdef VERIFY_MMX
	printf("MMX: packedYUYV422_to_planarYUYV420\n");
#endif
	if( !supportsMMX() ) {
		return packedYUYV422_to_planarYUYV420_nonMMX(dest, destWidth, destHeight,
				  src, srcWidth, srcHeight);
	}

	if( (srcWidth&0xf) || (destWidth&0xf) ) {
#ifdef VERIFY_MMX
		printf("width not 16 in video-v4l.cc packedYUYV422_to_planar420 MMX\n");
#endif
		return packedYUYV422_to_planarYUYV420_nonMMX(dest, destWidth, destHeight,
				  src, srcWidth, srcHeight);
	}
	if( (destHeight&0x1) || (srcHeight&0x1) ) {
#ifdef VERIFY_MMX
		printf("height not 2 in video-v4l.cc packedYUYV422_to_planar420 MMX\n");
#endif
		return packedYUYV422_to_planarYUYV420_nonMMX(dest, destWidth, destHeight,
				  src, srcWidth, srcHeight);
	}

	// number of row pairs that are actually converted
	int rows = srcHeight;
	if(destHeight < srcHeight) {
		rows = destHeight;
	}
	const int rowPairs = rows>>1;

	// read cursor and the three write cursors
	char *in   = src;
	char *outY = dest;
	char *outU = dest + destWidth*destHeight;
	char *outV = outU + ((destWidth*destHeight)>>2);

	// vertical centring: skip destination rows or drop source rows
	if(destHeight > srcHeight) {
		const int skippedRows = ((destHeight-srcHeight)>>2)<<1; // half the gap, forced even
		outY += skippedRows*destWidth;
		outU += (skippedRows*destWidth)>>2;
		outV += (skippedRows*destWidth)>>2;
	}
	else if(destHeight < srcHeight) {
		const int droppedRows = (srcHeight-destHeight)>>1;
		in += (droppedRows*srcWidth)<<1;
	}

	if(destWidth == srcWidth) {
		const int groups = destWidth>>4; // 16 pixels per kernel iteration
		for(int pair=0; pair<rowPairs; ++pair) {
			// first row of the pair: luma and chroma
#ifndef WIN32
			planar32bytesYUYV422( in, outY, outU, outV, groups);
#else
			planar32bytesYUYV422( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups);
#endif
			in   += (destWidth<<1);
			outY += destWidth;
			outU += (destWidth>>1);
			outV += (destWidth>>1);
			// second row of the pair: luma only
#ifndef WIN32
			planar32bytesYUYV( in, outY, groups);
#else
			planar32bytesYUYV( (long*)in, (long*)outY, groups);
#endif
			in   += (destWidth<<1);
			outY += destWidth;
		}
		return true;
	}

	// horizontal centring
	int padLeft = 0, padRight = 0, clipLeft = 0, clipRight = 0;
	if(srcWidth < destWidth) {
		padLeft  = (destWidth-srcWidth)>>1;
		padRight = destWidth-srcWidth-padLeft;
	}
	else if(srcWidth > destWidth) {
		clipLeft  = (srcWidth-destWidth)>>1;
		clipRight = srcWidth-destWidth-clipLeft;
	}

	if(padLeft!=0 || padRight!=0) {
		// source narrower than destination: one source row per call
		const int groups = srcWidth>>4; // divide by 16 pixels
		for(int pair=0; pair<rowPairs; ++pair) {
			outY += padLeft;
			outU += (padLeft>>1);
			outV += (padLeft>>1);
#ifndef WIN32
			planar32bytesYUYV422( in, outY, outU, outV, groups);
#else
			planar32bytesYUYV422( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups);
#endif
			// past this row and its right pad, then the next row's left pad
			in   += (srcWidth<<1);
			outY += srcWidth + padRight + padLeft;
			outU += (srcWidth>>1) + (padRight>>1);
			outV += (srcWidth>>1) + (padRight>>1);
#ifndef WIN32
			planar32bytesYUYV( in, outY, groups);
#else
			planar32bytesYUYV( (long*)in, (long*)outY, groups);
#endif
			in   += (srcWidth<<1);
			outY += srcWidth + padRight;
		}
	}
	else {
		// source wider than destination: one destination row per call
		const int groups = destWidth>>4;
		for(int pair=0; pair<rowPairs; ++pair) {
			in += (clipLeft<<1);
#ifndef WIN32
			planar32bytesYUYV422( in, outY, outU, outV, groups);
#else
			planar32bytesYUYV422( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups);
#endif
			// past this row and its right clip, then the next row's left clip
			in   += (destWidth<<1) + (clipRight<<1) + (clipLeft<<1);
			outY += destWidth;
			outU += (destWidth>>1);
			outV += (destWidth>>1);
#ifndef WIN32
			planar32bytesYUYV( in, outY, groups);
#else
			planar32bytesYUYV( (long*)in, (long*)outY, groups);
#endif
			in   += (destWidth<<1) + (clipRight<<1);
			outY += destWidth;
		}
	}
	return true;
}

#endif // end of MMX case

//
//	packedUYVY422_to_planarYUYV420
//
// Unpacks a packed UYVY 4:2:2 frame (bytes U Y V Y per pixel pair) into
// planar form *and* reduces the color subsampling from 4:2:2 to 4:2:0 by
// keeping the chroma of the first line of every line pair and throwing out
// the chroma information of the second line.
//
// dest receives destWidth*destHeight Y bytes, followed by two chroma planes
// of (destWidth*destHeight)>>2 bytes each (U first, then V).
//
// When source and destination sizes differ the source is centred in the
// destination: surplus source rows/columns are clipped, and destination
// rows/columns the source does not reach are stepped over without being
// written.  The left pad / left clip amounts are rounded down to an even
// pixel count: an odd left pad makes every chroma row one byte short so
// the U/V rows drift, and an odd left clip starts reading in the middle of
// a 4-byte U Y V Y group so U and V are swapped.
//
// Returns false (after printing a message) when any dimension is odd,
// true otherwise.

#ifndef X86_ASSEMBLER
bool packedUYVY422_to_planarYUYV420(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#else
bool packedUYVY422_to_planarYUYV420_nonMMX(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#endif
#ifdef VERIFY_MMX
	printf("non-MMX: packedUYVY422_to_planarYUYV420\n");
#endif
	if( (destHeight&0x1) || (srcHeight&0x1) ) {
		printf("even height required in packedUYVY422_to_planarYUYV420\n");
		return false;
	}
	if( (destWidth&0x1) || (srcWidth&0x1) ) {
		printf("wrong width in packedUYVY422_to_planarYUYV420_nonMMX\n");
		return false;
	}

	int  a,b;
	char *s = src;
	char *y = dest;
	char *u = y + destWidth * destHeight;
	char *v = u + ((destWidth * destHeight)>>2);

	if(destWidth==srcWidth && destHeight==srcHeight) {
		// packed representation is UYVY UYVY UYVY
		for (a = (destHeight>>1); a > 0; a--) {
			// first line of the pair: keep luma and chroma
			for (b = (destWidth>>1); b > 0; b--) {
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
			}
			// second line of the pair: keep luma, skip chroma
			for (b = (destWidth>>1); b > 0; b--) {
				s++;
				*(y++) = *(s++);
				s++;
				*(y++) = *(s++);
			}
		}
		return true;
	}

	// non-matching size case

	int leftPad=(srcWidth<destWidth)?((destWidth-srcWidth)>>1):0;
	int leftClip=(srcWidth>destWidth)?((srcWidth-destWidth)>>1):0;
	// chroma is stored once per pixel pair, so horizontal offsets must be even
	leftPad&=~(0x1);
	leftClip&=~(0x1);
	int rightPad=(srcWidth<destWidth)?(destWidth-srcWidth-leftPad):0;
	int rightClip=(srcWidth>destWidth)?(srcWidth-destWidth-leftClip):0;

	int upPad=(srcHeight<destHeight)?((destHeight-srcHeight)>>1):0;
	if(upPad&0x1) {	--upPad; }  // 4:2:0, so deal with even #'ed rows
	int upClip=(srcHeight>destHeight)?((srcHeight-destHeight)>>1):0;
	int rows=(srcHeight>destHeight)?destHeight:srcHeight;

	// handle the y up padding
	y+=(destWidth*upPad);
	u+=((destWidth*upPad)>>2);
	v+=((destWidth*upPad)>>2);

	// handle the y up clipping
	s+=((srcWidth*upClip)<<1);

	if(leftPad!=0 || rightPad!=0) { // if padding non-zero
		// packed representation is UYVY UYVY UYVY
		for (a = (rows>>1); a > 0; a--) {
			y+=leftPad;
			u+=(leftPad>>1);
			v+=(leftPad>>1);
			// first line of the pair: keep luma and chroma
			for (b = (srcWidth>>1); b > 0; b--) {
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
			}
			y+=rightPad;
			u+=(rightPad>>1);
			v+=(rightPad>>1);
			y+=leftPad;
			// second line of the pair: keep luma, skip chroma
			for (b = (srcWidth>>1); b > 0; b--) {
				s++;
				*(y++) = *(s++);
				s++;
				*(y++) = *(s++);
			}
			y+=rightPad;
		}
	}
	else { // clipping is non-zero (or only the heights differ)
		// packed representation is UYVY UYVY UYVY
		for (a = (rows>>1); a > 0; a--) {
			s+=(leftClip<<1);
			// first line of the pair: keep luma and chroma
			for (b = (destWidth>>1); b > 0; b--) {
				*(u++) = *(s++);
				*(y++) = *(s++);
				*(v++) = *(s++);
				*(y++) = *(s++);
			}
			s+=(rightClip<<1);
			s+=(leftClip<<1);
			// second line of the pair: keep luma, skip chroma
			for (b = (destWidth>>1); b > 0; b--) {
				s++;
				*(y++) = *(s++);
				s++;
				*(y++) = *(s++);
			}
			s+=(rightClip<<1);
		}
	}
	return true;
}
#ifdef X86_ASSEMBLER

#ifndef WIN32 // non-windows case
/*
 * planar32bytesUYVY (GCC inline-asm flavour, 32-bit x86 with MMX)
 *
 * Just extracts the y's: copies the high byte of every 16-bit word of
 * packed UYVY data to yy and discards the chroma bytes.
 *
 * Each loop iteration consumes 32 source bytes and stores 16 bytes at yy.
 *
 *   src       packed UYVY input
 *   yy        destination for the Y samples
 *   loops     number of 32-byte groups to convert; the loop test is at the
 *             bottom, so the body runs once even when loops <= 0
 *   issueEmms when true an "emms" is executed after the loop
 *
 * The loop label is a numeric local label ("1:" / "1b") rather than a
 * fixed name: this function is inline and called from several places, and
 * a fixed label would be emitted once per inlined copy, which the
 * assembler rejects as a duplicate symbol.
 */
inline void planar32bytesUYVY(char * src, char* yy, int loops, bool issueEmms=true)
{
	// all arguments are handed to the asm through one array addressed by eax
	unsigned int __volatile__ array[3]={(int)src, int(yy), loops};
	unsigned int var=(unsigned int)(array);
	int __volatile__ dummy1;

	__asm__ __volatile__(
		       // save the registers used as scratch below
		       "push %%ebx\n\t"
		       "push %%esi\n\t"

		       // ebx=yy, esi=loops, eax=src (loaded last, because eax is
		       // the base of the argument array)
		       "mov  4(%%eax), %%ebx\n\t"
		       "mov  8(%%eax), %%esi\n\t"
		       "mov  0(%%eax), %%eax\n\t"

		       "1:\n\t"

		       // load 32 source bytes
		       "movq   (%%eax), %%mm0\n\t"
		       "movq  8(%%eax), %%mm1\n\t"
		       "movq 16(%%eax), %%mm2\n\t"
		       "movq 24(%%eax), %%mm3\n\t"

		       // move the high (Y) byte of every word into the low byte
		       "psrlw $8, %%mm0\n\t"
		       "psrlw $8, %%mm1\n\t"
		       "psrlw $8, %%mm2\n\t"
		       "psrlw $8, %%mm3\n\t"

		       // pack to bytes and store 2 x 8 Y's
		       "packuswb %%mm1, %%mm0\n\t"
		       "movq %%mm0, (%%ebx)\n\t"

		       "packuswb %%mm3, %%mm2\n\t"
		       "movq %%mm2, 8(%%ebx)\n\t"
		       // advance and loop while the counter is > 0
		       "dec %%esi\n\t"
		       "add $32, %%eax\n\t"
		       "add $16, %%ebx\n\t"
		       "cmp $0,%%esi\n\t"
		       "jg 1b\n\t"
		       "pop %%esi\n\t"
		       "pop %%ebx\n\t"
		       : "=a" (dummy1)
		       : "a" (var) /*(array)*/
		       : "memory");
	// emms is issued separately so callers can skip it between rows
	if(issueEmms) {
			__asm__ __volatile__(
				"emms\n\t"
				:
				:
				:"memory");
	}
}

#else // windows case

/*
 * planar32bytesUYVY (MSVC inline-asm flavour, 32-bit x86 with MMX)
 *
 * Just extracts the y's: copies the high byte of every 16-bit word of the
 * packed input to y and discards the other byte.
 *
 * Each loop iteration reads 32 bytes from data and stores 16 bytes at y.
 *
 *   data      packed UYVY input
 *   y         destination for the Y samples
 *   loops     number of 32-byte groups to convert; the loop test is at the
 *             bottom, so the body runs once even when loops <= 0
 *   issueEmms when true an emms is executed after the loop
 */
inline void planar32bytesUYVY(long * data, long* y, int loops, bool issueEmms=true)
{
	__int64 andArray=0x00ff00ff00ff00ff;
	_asm
	{	
		push edi	// push a register we really shouldn't mess with
		// Load pointers
		mov eax, [data]		// load ptr to source
		mov ebx, [y]		// location of Y dest
		mov edi, [loops]	// load loop iterations
		movq mm6, [andArray]	// Load AND array (mm6 is not referenced by the loop below)

top_of_loop:		

		// Dereference pointers to load data into registers
		movq mm0, [eax]
		movq mm1, [eax+8]
		movq mm2, [eax+16]
		movq mm3, [eax+24]

		// Shift the high byte (Y) of every word down, discarding the low byte
		psrlw mm0, 8		// strip off U/V's
		psrlw mm1, 8		// strip off U/V's
		psrlw mm2, 8		// strip off U/V's
		psrlw mm3, 8		// strip off U/V's

		packuswb mm0, mm1	// Pack first half of Y's together
		movq [ebx], mm0		// Save Y's
		packuswb mm2, mm3	// Pack second half of Y's together
		movq [ebx+8], mm2		// Save Y's
		dec edi				// Decrement loop count <SA>
		add eax, 32		// Advance pointer
		add ebx, 16		// Advance pointer
		cmp edi,0		// Perform test
		jg top_of_loop	// jump back to top for another loop
//        emms is not issued here; see the conditional block after the asm
		pop edi			// restore the register
	}
	// Early return when the caller does not want emms; the braces that
	// follow are a plain block that only runs when issueEmms is true.
	if(!issueEmms) return;{
		_asm
		{	
			emms            // empty MMX state
		}
	}

}

#endif // end windows case

/*
 * packedUYVY422_to_planarYUYV420 (MMX entry point)
 *
 * Converts a packed UYVY 4:2:2 frame into planar Y/U/V 4:2:0.  Rows are
 * processed in pairs: the first row of a pair goes through
 * planar32bytesUYVY422 (luma and chroma), the second through
 * planar32bytesUYVY (luma only).  The kernels are called with EMMS_FLAG
 * and issueEmms() is called once after the last row.
 *
 * When the sizes differ the source is centred in the destination: surplus
 * source rows/columns are skipped, and destination rows/columns that the
 * source does not reach are stepped over without being written.
 *
 * The work is handed to packedUYVY422_to_planarYUYV420_nonMMX when MMX is
 * not available, a width is not a multiple of 16, or a height is odd.
 * Returns true on the MMX path, otherwise the fallback's result.
 */
bool packedUYVY422_to_planarYUYV420(char* dest, int destWidth, int destHeight,
									  char* src, int srcWidth, int srcHeight)
{
#ifdef VERIFY_MMX
	printf("MMX: packedUYVY422_to_planarYUYV420\n");
#endif
	if( !supportsMMX() ) {
		return packedUYVY422_to_planarYUYV420_nonMMX(dest, destWidth, destHeight,
				  src, srcWidth, srcHeight);
	}

	if( (srcWidth&0xf) || (destWidth&0xf) ) {
#ifdef VERIFY_MMX
		printf("width not 16 in video-v4l.cc packedUYVU422_to_planar420 MMX\n");
#endif
		return packedUYVY422_to_planarYUYV420_nonMMX(dest, destWidth, destHeight,
				  src, srcWidth, srcHeight);
	}
	if( (destHeight&0x1) || (srcHeight&0x1) ) {
#ifdef VERIFY_MMX
		printf("height not 2 in video-v4l.cc packedUYVU422_to_planar420 MMX\n");
#endif
		return packedUYVY422_to_planarYUYV420_nonMMX(dest, destWidth, destHeight,
				  src, srcWidth, srcHeight);
	}

	// number of row pairs that are actually converted
	int rows = srcHeight;
	if(destHeight < srcHeight) {
		rows = destHeight;
	}
	const int rowPairs = rows>>1;

	// read cursor and the three write cursors
	char *in   = src;
	char *outY = dest;
	char *outU = dest + destWidth*destHeight;
	char *outV = outU + ((destWidth*destHeight)>>2);

	// vertical centring: skip destination rows or drop source rows
	if(destHeight > srcHeight) {
		const int skippedRows = ((destHeight-srcHeight)>>2)<<1; // half the gap, forced even
		outY += skippedRows*destWidth;
		outU += (skippedRows*destWidth)>>2;
		outV += (skippedRows*destWidth)>>2;
	}
	else if(destHeight < srcHeight) {
		const int droppedRows = (srcHeight-destHeight)>>1;
		in += (droppedRows*srcWidth)<<1;
	}

	// horizontal centring (all four stay zero when the widths match)
	int padLeft = 0, padRight = 0, clipLeft = 0, clipRight = 0;
	if(srcWidth < destWidth) {
		padLeft  = (destWidth-srcWidth)>>1;
		padRight = destWidth-srcWidth-padLeft;
	}
	else if(srcWidth > destWidth) {
		clipLeft  = (srcWidth-destWidth)>>1;
		clipRight = srcWidth-destWidth-clipLeft;
	}

	if(destWidth == srcWidth) {
		const int groups = destWidth>>4; // divide by 16 pixels
		for(int pair=0; pair<rowPairs; ++pair) {
			// first row of the pair: luma and chroma
#ifndef WIN32
			planar32bytesUYVY422( in, outY, outU, outV, groups, EMMS_FLAG);
#else
			planar32bytesUYVY422( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups, EMMS_FLAG);
#endif
			in   += destWidth<<1;
			outY += destWidth;
			outU += destWidth>>1;
			outV += destWidth>>1;
			// second row of the pair: luma only
#ifndef WIN32
			planar32bytesUYVY( in, outY, groups, EMMS_FLAG);
#else
			planar32bytesUYVY( (long*)in, (long*)outY, groups, EMMS_FLAG);
#endif
			in   += destWidth<<1;
			outY += destWidth;
		}
	}
	else if(padLeft!=0 || padRight!=0) {
		// source narrower than destination: one source row per call
		const int groups = srcWidth>>4; // divide by 16 pixels
		for(int pair=0; pair<rowPairs; ++pair) {
			outY += padLeft; // left pad the output
			outU += (padLeft>>1);
			outV += (padLeft>>1);
#ifndef WIN32
			planar32bytesUYVY422( in, outY, outU, outV, groups, EMMS_FLAG);
#else
			planar32bytesUYVY422( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups, EMMS_FLAG);
#endif
			// past this row and its right pad, then the next row's left pad
			in   += (srcWidth<<1);
			outY += srcWidth + padRight + padLeft;
			outU += (srcWidth>>1) + (padRight>>1);
			outV += (srcWidth>>1) + (padRight>>1);
#ifndef WIN32
			planar32bytesUYVY( in, outY, groups, EMMS_FLAG);
#else
			planar32bytesUYVY( (long*)in, (long*)outY, groups, EMMS_FLAG);
#endif
			in   += (srcWidth<<1);
			outY += srcWidth + padRight;
		}
	}
	else {
		// source wider than destination: one destination row per call
		const int groups = destWidth>>4; // divide by 16 pixels
		for(int pair=0; pair<rowPairs; ++pair) {
			in += (clipLeft<<1);
#ifndef WIN32
			planar32bytesUYVY422( in, outY, outU, outV, groups, EMMS_FLAG);
#else
			planar32bytesUYVY422( (long*)in, (long*)outY, (long*)outU, (long*)outV, groups, EMMS_FLAG);
#endif
			// past this row and its right clip, then the next row's left clip
			in   += (destWidth<<1) + (clipRight<<1) + (clipLeft<<1);
			outY += destWidth;
			outU += (destWidth>>1);
			outV += (destWidth>>1);
#ifndef WIN32
			planar32bytesUYVY( in, outY, groups, EMMS_FLAG);
#else
			planar32bytesUYVY( (long*)in, (long*)outY, groups, EMMS_FLAG);
#endif
			in   += (destWidth<<1) + (clipRight<<1);
			outY += destWidth;
		}
	}
	issueEmms(); // send an Emms instruction to reset the MMX state
	return true;
}

#endif // end of MMX case
