//  -----------------------------------------------------------------------------------------
//    Extended x264 Output (GUI) Ex  v1.xx/2.xx by rigaya
//  -----------------------------------------------------------------------------------------
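//   About this source code
//   - It is provided with no warranty.
//   - rigaya accepts no responsibility for any damage or trouble caused by using this source code.
//   If you agree to the above, you may use, copy, modify, and redistribute this source code.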
//  -----------------------------------------------------------------------------------------

#include <Windows.h>
#include <mmintrin.h>  //intrinsics: SSE
#include <emmintrin.h> //intrinsics: SSE2
#include <tmmintrin.h> //intrinsics: SSSE3
#include <smmintrin.h> //intrinsics: SSE4.1

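/// -- File for generating AVX instructions -------------------------------------------------
///
///    Compiled with the /arch:AVX compiler option
///      The function bodies are simply copy-pasted and the functions renamed to *_avx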
///
/// -----------------------------------------------------------------------------------------

#include "convert.h"
#include "convert_const.h"

//AVX is available from VC++2010 onward
#if (_MSC_VER >= 1600)
// Progressive YUY2 -> NV12 conversion, 16 pixels per step, using non-temporal stores.
//   frame      : packed source, read as 2 bytes per pixel with a row pitch of width*2 bytes
//   pixel_data : data[0] receives the luma plane (pitch = width bytes);
//                data[1] receives the interleaved chroma plane, one row per two source rows
//   width      : pixels per row; rows are processed in whole 16-pixel groups, no tail handling
//   height     : number of rows, consumed two at a time
// NOTE(review): _mm_stream_si128 requires 16-byte-aligned addresses; presumably the caller only
// selects this "_mod16" variant when the planes and width satisfy that -- confirm.
void convert_yuy2_to_nv12_avx_mod16(void *frame, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	BYTE *const luma_plane   = pixel_data->data[0];
	BYTE *const chroma_plane = pixel_data->data[1];
	for (int row = 0; row < height; row += 2) {
		const int row_start = row * width;
		BYTE *src_upper  = (BYTE *)frame + (row_start << 1);
		BYTE *src_lower  = src_upper + (width << 1);
		BYTE *luma_upper = luma_plane + row_start;
		BYTE *luma_lower = luma_upper + width;
		BYTE *chroma_row = chroma_plane + (row_start >> 1);
		for (int col = 0; col < width; col += 16) {
			// Upper row. Four rounds of byte interleaving over the 32 source bytes gather the
			// even bytes (luma) in the low result and the odd bytes (chroma) in the high result.
			__m128i src_a = _mm_loadu_si128((const __m128i *)(src_upper +  0));
			__m128i src_b = _mm_loadu_si128((const __m128i *)(src_upper + 16));

			// Pull the lower row towards the cache while the upper row is being shuffled.
			_mm_prefetch((const char *)src_lower, _MM_HINT_T1);

			__m128i mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
			__m128i mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
			__m128i mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
			__m128i mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
			__m128i mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
			__m128i mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
			const __m128i luma_a   = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
			const __m128i chroma_a = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

			_mm_stream_si128((__m128i *)(luma_upper + col), luma_a);

			// Lower row: identical de-interleave.
			src_a = _mm_loadu_si128((const __m128i *)(src_lower +  0));
			src_b = _mm_loadu_si128((const __m128i *)(src_lower + 16));

			mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
			mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
			mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
			mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
			mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
			mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
			const __m128i luma_b   = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
			const __m128i chroma_b = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

			_mm_stream_si128((__m128i *)(luma_lower + col), luma_b);

			// Vertical chroma subsampling: rounded byte average of the two rows.
			_mm_stream_si128((__m128i *)(chroma_row + col), _mm_avg_epu8(chroma_b, chroma_a));

			src_upper += 32;
			src_lower += 32;
		}
	}
}

// Progressive YUY2 -> NV12 conversion, 16 pixels per step, using unaligned stores.
//   frame      : packed source, read as 2 bytes per pixel with a row pitch of width*2 bytes
//   pixel_data : data[0] receives the luma plane (pitch = width bytes);
//                data[1] receives the interleaved chroma plane, one row per two source rows
//   width      : pixels per row; rows are processed in whole 16-pixel groups, no tail handling
//   height     : number of rows, consumed two at a time
void convert_yuy2_to_nv12_avx(void *frame, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	BYTE *const luma_plane   = pixel_data->data[0];
	BYTE *const chroma_plane = pixel_data->data[1];
	for (int row = 0; row < height; row += 2) {
		const int row_start = row * width;
		BYTE *src_upper  = (BYTE *)frame + (row_start << 1);
		BYTE *src_lower  = src_upper + (width << 1);
		BYTE *luma_upper = luma_plane + row_start;
		BYTE *luma_lower = luma_upper + width;
		BYTE *chroma_row = chroma_plane + (row_start >> 1);
		for (int col = 0; col < width; col += 16) {
			// Upper row. Four rounds of byte interleaving over the 32 source bytes gather the
			// even bytes (luma) in the low result and the odd bytes (chroma) in the high result.
			__m128i src_a = _mm_loadu_si128((const __m128i *)(src_upper +  0));
			__m128i src_b = _mm_loadu_si128((const __m128i *)(src_upper + 16));

			// Pull the lower row towards the cache while the upper row is being shuffled.
			_mm_prefetch((const char *)src_lower, _MM_HINT_T1);

			__m128i mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
			__m128i mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
			__m128i mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
			__m128i mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
			__m128i mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
			__m128i mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
			const __m128i luma_a   = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
			const __m128i chroma_a = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

			_mm_storeu_si128((__m128i *)(luma_upper + col), luma_a);

			// Lower row: identical de-interleave.
			src_a = _mm_loadu_si128((const __m128i *)(src_lower +  0));
			src_b = _mm_loadu_si128((const __m128i *)(src_lower + 16));

			mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
			mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
			mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
			mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
			mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
			mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
			const __m128i luma_b   = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
			const __m128i chroma_b = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

			_mm_storeu_si128((__m128i *)(luma_lower + col), luma_b);

			// Vertical chroma subsampling: rounded byte average of the two rows.
			_mm_storeu_si128((__m128i *)(chroma_row + col), _mm_avg_epu8(chroma_b, chroma_a));

			src_upper += 32;
			src_lower += 32;
		}
	}
}

// Interlaced YUY2 -> NV12 conversion, 16 pixels per step, using non-temporal stores.
// Rows are handled in groups of four; for field = 0,1 the rows (group+field) and
// (group+field+2) are combined, so chroma is only mixed between rows of the same field.
//   frame      : packed source, read as 2 bytes per pixel with a row pitch of width*2 bytes
//   pixel_data : data[0] receives the luma plane (pitch = width bytes);
//                data[1] receives the interleaved chroma plane
//   width      : pixels per row; rows are processed in whole 16-pixel groups, no tail handling
//   height     : number of rows, consumed four at a time
// NOTE(review): _mm_stream_si128 requires 16-byte-aligned addresses; presumably the caller only
// selects this "_mod16" variant when the planes and width satisfy that -- confirm.
void convert_yuy2_to_nv12_i_avx_mod16(void *frame, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	BYTE *const luma_plane   = pixel_data->data[0];
	BYTE *const chroma_plane = pixel_data->data[1];
	for (int group = 0; group < height; group += 4) {
		for (int field = 0; field < 2; field++) {
			const int row_start = (group + field) * width;
			BYTE *src_near   = (BYTE *)frame + (row_start << 1);
			BYTE *src_far    = src_near + (width << 2);          // two rows further down
			BYTE *luma_near  = luma_plane + row_start;
			BYTE *luma_far   = luma_near + (width << 1);
			BYTE *chroma_row = chroma_plane + ((row_start + width * field) >> 1);
			for (int col = 0; col < width; col += 16) {
				// Near row (group + field). Four rounds of byte interleaving gather the even
				// bytes (luma) in the low result and the odd bytes (chroma) in the high result.
				__m128i src_a = _mm_loadu_si128((__m128i *)(src_near +  0));
				__m128i src_b = _mm_loadu_si128((__m128i *)(src_near + 16));

				_mm_prefetch((const char *)src_far, _MM_HINT_T1);

				__m128i mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
				__m128i mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
				__m128i mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
				__m128i mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
				__m128i mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
				__m128i mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
				const __m128i luma_near_v   = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
				const __m128i chroma_near_v = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

				_mm_stream_si128((__m128i *)(luma_near + col), luma_near_v);

				// Far row (group + field + 2): identical de-interleave.
				src_a = _mm_loadu_si128((__m128i *)(src_far +  0));
				src_b = _mm_loadu_si128((__m128i *)(src_far + 16));

				mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
				mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
				mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
				mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
				mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
				mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
				const __m128i luma_far_v   = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
				const __m128i chroma_far_v = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

				_mm_stream_si128((__m128i *)(luma_far + col), luma_far_v);

				// Weighted chroma mix: pair each far-row byte with the matching near-row byte,
				// multiply-add the pairs by the per-field weights (xC_INTERLACE_WEIGHT is
				// defined outside the visible source), then round with +2 and shift right by 2.
				__m128i blend_lo = _mm_unpacklo_epi8(chroma_far_v, chroma_near_v);
				__m128i blend_hi = _mm_unpackhi_epi8(chroma_far_v, chroma_near_v);
				blend_lo = _mm_maddubs_epi16(blend_lo, xC_INTERLACE_WEIGHT(field));
				blend_hi = _mm_maddubs_epi16(blend_hi, xC_INTERLACE_WEIGHT(field));
				blend_lo = _mm_add_epi16(blend_lo, _mm_set1_epi16(2));
				blend_hi = _mm_add_epi16(blend_hi, _mm_set1_epi16(2));
				blend_lo = _mm_srai_epi16(blend_lo, 2);
				blend_hi = _mm_srai_epi16(blend_hi, 2);

				// Saturate the 16-bit results back to bytes and write the chroma row.
				_mm_stream_si128((__m128i *)(chroma_row + col), _mm_packus_epi16(blend_lo, blend_hi));

				src_near += 32;
				src_far  += 32;
			}
		}
	}
}

// Interlaced YUY2 -> NV12 conversion, 16 pixels per step, using unaligned stores.
// Rows are handled in groups of four; for field = 0,1 the rows (group+field) and
// (group+field+2) are combined, so chroma is only mixed between rows of the same field.
//   frame      : packed source, read as 2 bytes per pixel with a row pitch of width*2 bytes
//   pixel_data : data[0] receives the luma plane (pitch = width bytes);
//                data[1] receives the interleaved chroma plane
//   width      : pixels per row; rows are processed in whole 16-pixel groups, no tail handling
//   height     : number of rows, consumed four at a time
void convert_yuy2_to_nv12_i_avx(void *frame, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	BYTE *const luma_plane   = pixel_data->data[0];
	BYTE *const chroma_plane = pixel_data->data[1];
	for (int group = 0; group < height; group += 4) {
		for (int field = 0; field < 2; field++) {
			const int row_start = (group + field) * width;
			BYTE *src_near   = (BYTE *)frame + (row_start << 1);
			BYTE *src_far    = src_near + (width << 2);          // two rows further down
			BYTE *luma_near  = luma_plane + row_start;
			BYTE *luma_far   = luma_near + (width << 1);
			BYTE *chroma_row = chroma_plane + ((row_start + width * field) >> 1);
			for (int col = 0; col < width; col += 16) {
				// Near row (group + field). Four rounds of byte interleaving gather the even
				// bytes (luma) in the low result and the odd bytes (chroma) in the high result.
				__m128i src_a = _mm_loadu_si128((__m128i *)(src_near +  0));
				__m128i src_b = _mm_loadu_si128((__m128i *)(src_near + 16));

				_mm_prefetch((const char *)src_far, _MM_HINT_T1);

				__m128i mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
				__m128i mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
				__m128i mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
				__m128i mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
				__m128i mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
				__m128i mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
				const __m128i luma_near_v   = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
				const __m128i chroma_near_v = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

				_mm_storeu_si128((__m128i *)(luma_near + col), luma_near_v);

				// Far row (group + field + 2): identical de-interleave.
				src_a = _mm_loadu_si128((__m128i *)(src_far +  0));
				src_b = _mm_loadu_si128((__m128i *)(src_far + 16));

				mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
				mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
				mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
				mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
				mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
				mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
				const __m128i luma_far_v   = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
				const __m128i chroma_far_v = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

				_mm_storeu_si128((__m128i *)(luma_far + col), luma_far_v);

				// Weighted chroma mix: pair each far-row byte with the matching near-row byte,
				// multiply-add the pairs by the per-field weights (xC_INTERLACE_WEIGHT is
				// defined outside the visible source), then round with +2 and shift right by 2.
				__m128i blend_lo = _mm_unpacklo_epi8(chroma_far_v, chroma_near_v);
				__m128i blend_hi = _mm_unpackhi_epi8(chroma_far_v, chroma_near_v);
				blend_lo = _mm_maddubs_epi16(blend_lo, xC_INTERLACE_WEIGHT(field));
				blend_hi = _mm_maddubs_epi16(blend_hi, xC_INTERLACE_WEIGHT(field));
				blend_lo = _mm_add_epi16(blend_lo, _mm_set1_epi16(2));
				blend_hi = _mm_add_epi16(blend_hi, _mm_set1_epi16(2));
				blend_lo = _mm_srai_epi16(blend_lo, 2);
				blend_hi = _mm_srai_epi16(blend_hi, 2);

				// Saturate the 16-bit results back to bytes and write the chroma row.
				_mm_storeu_si128((__m128i *)(chroma_row + col), _mm_packus_epi16(blend_lo, blend_hi));

				src_near += 32;
				src_far  += 32;
			}
		}
	}
}

// Progressive YC48 -> 16-bit NV12 conversion, 8 pixels per step, using unaligned stores.
//   pixel      : source frame read as 3 shorts per pixel (24 shorts per 8-pixel group),
//                row pitch = width*3 shorts
//   pixel_data : data[0] is written as the 16-bit Y plane (pitch = width shorts);
//                data[1] is written as the 16-bit chroma plane, one row of width shorts
//                per two source rows
//   width      : pixels per row; rows are processed in whole 8-pixel groups, no tail handling
//   height     : number of rows, consumed two at a time
// The scaling constants (xC_Y_L_MA_16, xC_UV_L_MA_16_420P, RSFT_ONE, LSFT_YCC_16, Y_L_RSH_16,
// UV_L_RSH_16_420P, UV_OFFSET_x2, xC_SUFFLE_YCP_Y) are not defined in the visible source
// (presumably convert_const.h); the comments below describe the data flow only.
void convert_yc48_to_nv12_16bit_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	int x, y;
	short *dst_Y = (short *)pixel_data->data[0];
	short *dst_C = (short *)pixel_data->data[1];
	short *ycp, *ycpw;
	short *Y = NULL, *C = NULL;
	// Word-lane masks for _mm_blend_epi16 (bit n set = take lane n from the second operand).
	const int MASK_INT_Y  = 0x80 + 0x10 + 0x02;
	const int MASK_INT_UV = 0x40 + 0x20 + 0x01;
	__m128i x0, x1, x2, x3, x6, x7;
	for (y = 0; y < height; y += 2) {
		ycp = (short*)pixel + width * y * 3;   // row y
		ycpw= ycp + width*3;                   // row y+1
		Y   = (short*)dst_Y + width * y;
		C   = (short*)dst_C + width * y / 2;
		for (x = 0; x < width; x += 8, ycp += 24, ycpw += 24) {
			// Row 1: 8 pixels = 24 shorts spread over three registers.
			x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
			x2 = _mm_loadu_si128((__m128i *)(ycp +  8));
			x3 = _mm_loadu_si128((__m128i *)(ycp + 16));

			_mm_prefetch((const char *)ycpw, _MM_HINT_T1);

			// Gather the first component of all 8 pixels: lanes 0,3,6 of x1, lanes 1,4,7 of x2,
			// lanes 2,5 of x3; then reorder with the xC_SUFFLE_YCP_Y byte-shuffle control
			// (presumably into pixel order -- constant not visible here).
			x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
			x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
			x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

			// Gather the second and third components of pixels 0,2,4,6; the one-word rotate plus
			// the dword shuffle put the (2nd,3rd) pairs in pixel order 0,2,4,6.
			x6 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
			x6 = _mm_blend_epi16(x6, x3, MASK_INT_UV>>2);
			x6 = _mm_alignr_epi8(x6, x6, 2);
			x6 = _mm_shuffle_epi32(x6, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 1

				//Y: compute row 1
				// x1 = all bits set, each 16-bit lane shifted right by RSFT_ONE
				// (the name suggests this leaves the value 1 -- TODO confirm).
				x1 = _mm_cmpeq_epi8(x1, x1);
				x1 = _mm_srli_epi16(x1, RSFT_ONE);
				// Pair every sample with that constant so madd yields
				// sample*coef0 + constant*coef1 in each 32-bit lane.
				x7 = _mm_unpackhi_epi16(x0, x1);
				x0 = _mm_unpacklo_epi16(x0, x1);

				x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
				x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
				x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
				x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
				// Post-shift offset: the constant shifted left by LSFT_YCC_16, widened to 32-bit lanes.
				x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
				x0 = _mm_add_epi32(x0, x1);
				x7 = _mm_add_epi32(x7, x1);

				// Narrow to 16 bits with unsigned saturation.
				x0 = _mm_packus_epi32(x0, x7);

			_mm_storeu_si128((__m128i *)(Y + x), x0);

			// Row 2: same extraction as row 1.
			x1 = _mm_loadu_si128((__m128i *)(ycpw +  0));
			x2 = _mm_loadu_si128((__m128i *)(ycpw +  8));
			x3 = _mm_loadu_si128((__m128i *)(ycpw + 16));

			x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
			x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
			x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

			x1 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
			x1 = _mm_blend_epi16(x1, x3, MASK_INT_UV>>2); 
			x1 = _mm_alignr_epi8(x1, x1, 2);
			x2 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 2

			//UV row 1 + row 2 (16-bit sum; scaled further below)
			x6 = _mm_add_epi16(x2, x6);
				
				//Y: compute row 2 (same steps as row 1)
				x1 = _mm_cmpeq_epi8(x1, x1);
				x1 = _mm_srli_epi16(x1, RSFT_ONE);
				x7 = _mm_unpackhi_epi16(x0, x1);
				x0 = _mm_unpacklo_epi16(x0, x1);

				x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
				x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
				x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
				x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
				x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
				x0 = _mm_add_epi32(x0, x1);
				x7 = _mm_add_epi32(x7, x1);

				x0 = _mm_packus_epi32(x0, x7);

			_mm_storeu_si128((__m128i *)(Y + x + width), x0);

			// Chroma: add UV_OFFSET_x2 to the two-row sum, then apply the same
			// multiply-add / shift / offset / saturate sequence with the 4:2:0 progressive constants.
			x0 = _mm_add_epi16(x6, _mm_set1_epi16(UV_OFFSET_x2));

			// x1 was overwritten with the 32-bit offset above, so the constant is rebuilt here.
			x1 = _mm_cmpeq_epi8(x1, x1);
			x1 = _mm_srli_epi16(x1, RSFT_ONE);
			x7 = _mm_unpackhi_epi16(x0, x1);
			x0 = _mm_unpacklo_epi16(x0, x1);

			x0 = _mm_madd_epi16(x0, xC_UV_L_MA_16_420P);
			x7 = _mm_madd_epi16(x7, xC_UV_L_MA_16_420P);
			x0 = _mm_srai_epi32(x0, UV_L_RSH_16_420P);
			x7 = _mm_srai_epi32(x7, UV_L_RSH_16_420P);
			x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
			x0 = _mm_add_epi32(x0, x1);
			x7 = _mm_add_epi32(x7, x1);

			x0 = _mm_packus_epi32(x0, x7);

			_mm_storeu_si128((__m128i *)(C + x), x0);
		}
	}
}

// Interlaced YC48 -> 16-bit NV12 conversion, 8 pixels per step, using unaligned stores.
// Rows are handled in groups of four; for i = 0,1 the rows (y+i) and (y+i+2) are combined,
// so chroma is only mixed between rows of the same field.
//   pixel      : source frame read as 3 shorts per pixel, row pitch = width*3 shorts
//   pixel_data : data[0] is written as the 16-bit Y plane (pitch = width shorts);
//                data[1] is written as the 16-bit chroma plane (row y/2 + i for each pass)
//   width      : pixels per row; rows are processed in whole 8-pixel groups, no tail handling
//   height     : number of rows, consumed four at a time
// The scaling constants (xC_Y_L_MA_16, xC_UV_L_MA_16_420I, RSFT_ONE, LSFT_YCC_16, Y_L_RSH_16,
// UV_L_RSH_16_420I, UV_OFFSET_x1, xC_SUFFLE_YCP_Y) are not defined in the visible source
// (presumably convert_const.h); the comments below describe the data flow only.
void convert_yc48_to_nv12_i_16bit_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	int x, y, i;
	short *dst_Y = (short *)pixel_data->data[0];
	short *dst_C = (short *)pixel_data->data[1];
	short *ycp, *ycpw;
	short *Y = NULL, *C = NULL;
	// Word-lane masks for _mm_blend_epi16 (bit n set = take lane n from the second operand).
	const int MASK_INT_Y  = 0x80 + 0x10 + 0x02;
	const int MASK_INT_UV = 0x40 + 0x20 + 0x01;
	__m128i x0, x1, x2, x3, x6, x7;
	for (y = 0; y < height; y += 4) {
		for (i = 0; i < 2; i++) {
			ycp = (short*)pixel + width * (y + i) * 3;   // row y+i
			ycpw= ycp + width*2*3;                       // row y+i+2 (same field)
			Y   = (short*)dst_Y + width * (y + i);
			C   = (short*)dst_C + width * (y + i*2) / 2;
			for (x = 0; x < width; x += 8, ycp += 24, ycpw += 24) {
				// Row 1+i: 8 pixels = 24 shorts spread over three registers.
				x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
				x2 = _mm_loadu_si128((__m128i *)(ycp +  8));
				x3 = _mm_loadu_si128((__m128i *)(ycp + 16));

				_mm_prefetch((const char *)ycpw, _MM_HINT_T1);

				// Gather the first component of all 8 pixels (lanes 0,3,6 of x1, 1,4,7 of x2,
				// 2,5 of x3), then reorder with the xC_SUFFLE_YCP_Y byte-shuffle control
				// (presumably into pixel order -- constant not visible here).
				x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
				x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
				x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

				// Gather the second and third components of pixels 0,2,4,6; the one-word rotate
				// plus the dword shuffle put the (2nd,3rd) pairs in pixel order 0,2,4,6.
				x6 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
				x6 = _mm_blend_epi16(x6, x3, MASK_INT_UV>>2);
				x6 = _mm_alignr_epi8(x6, x6, 2);
				x6 = _mm_shuffle_epi32(x6, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 1+i

					//Y: compute row 1+i
					// x1 = all bits set, each 16-bit lane shifted right by RSFT_ONE
					// (the name suggests this leaves the value 1 -- TODO confirm).
					x1 = _mm_cmpeq_epi8(x1, x1);
					x1 = _mm_srli_epi16(x1, RSFT_ONE);
					// Pair every sample with that constant so madd yields
					// sample*coef0 + constant*coef1 in each 32-bit lane.
					x7 = _mm_unpackhi_epi16(x0, x1);
					x0 = _mm_unpacklo_epi16(x0, x1);

					x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
					x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
					x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
					x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
					// Post-shift offset: the constant shifted left by LSFT_YCC_16, widened to 32-bit lanes.
					x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
					x0 = _mm_add_epi32(x0, x1);
					x7 = _mm_add_epi32(x7, x1);

					// Narrow to 16 bits with unsigned saturation.
					x0 = _mm_packus_epi32(x0, x7);

					_mm_storeu_si128((__m128i *)(Y + x), x0);

				// Row 3+i: same extraction.
				x1 = _mm_loadu_si128((__m128i *)(ycpw +  0));
				x2 = _mm_loadu_si128((__m128i *)(ycpw +  8));
				x3 = _mm_loadu_si128((__m128i *)(ycpw + 16));

				x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
				x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
				x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

				x1 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
				x1 = _mm_blend_epi16(x1, x3, MASK_INT_UV>>2);
				x1 = _mm_alignr_epi8(x1, x1, 2);
				x2 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 3+i

					//Y: compute row 3+i (same steps as above)
					x1 = _mm_cmpeq_epi8(x1, x1);
					x1 = _mm_srli_epi16(x1, RSFT_ONE);
					x7 = _mm_unpackhi_epi16(x0, x1);
					x0 = _mm_unpacklo_epi16(x0, x1);

					x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
					x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
					x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
					x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
					x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
					x0 = _mm_add_epi32(x0, x1);
					x7 = _mm_add_epi32(x7, x1);

					x0 = _mm_packus_epi32(x0, x7);

					_mm_storeu_si128((__m128i *)(Y + x + width*2), x0);

				// Chroma: offset both rows by UV_OFFSET_x1 before the weighted mix.
				x6 = _mm_add_epi16(x6, _mm_set1_epi16(UV_OFFSET_x1));
				x2 = _mm_add_epi16(x2, _mm_set1_epi16(UV_OFFSET_x1));

				// x1 was overwritten with the 32-bit offset above, so the constant is rebuilt here.
				x1 = _mm_cmpeq_epi8(x1, x1);
				x1 = _mm_srli_epi16(x1, RSFT_ONE);

				// Pair each chroma sample of both rows with the constant.
				x7 = _mm_unpackhi_epi16(x6, x1);
				x6 = _mm_unpacklo_epi16(x6, x1);
				x3 = _mm_unpackhi_epi16(x2, x1);
				x2 = _mm_unpacklo_epi16(x2, x1);

				// Row 1+i uses coefficient set i, row 3+i uses the other set; the two
				// products are summed before the shift.
				x6 = _mm_madd_epi16(x6, xC_UV_L_MA_16_420I(i));
				x7 = _mm_madd_epi16(x7, xC_UV_L_MA_16_420I(i));
				x2 = _mm_madd_epi16(x2, xC_UV_L_MA_16_420I((i+1)&0x01));
				x3 = _mm_madd_epi16(x3, xC_UV_L_MA_16_420I((i+1)&0x01));
				x0 = _mm_add_epi32(x6, x2);
				x7 = _mm_add_epi32(x7, x3);
				x0 = _mm_srai_epi32(x0, UV_L_RSH_16_420I);
				x7 = _mm_srai_epi32(x7, UV_L_RSH_16_420I);
				x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
				x0 = _mm_add_epi32(x0, x1);
				x7 = _mm_add_epi32(x7, x1);

				x0 = _mm_packus_epi32(x0, x7);

				_mm_storeu_si128((__m128i *)(C + x), x0);
			}
		}
	}
}

// Progressive YC48 -> 16-bit NV12 conversion, 8 pixels per step, using non-temporal stores.
//   pixel      : source frame read as 3 shorts per pixel (24 shorts per 8-pixel group),
//                row pitch = width*3 shorts
//   pixel_data : data[0] is written as the 16-bit Y plane (pitch = width shorts);
//                data[1] is written as the 16-bit chroma plane, one row of width shorts
//                per two source rows
//   width      : pixels per row; rows are processed in whole 8-pixel groups, no tail handling
//   height     : number of rows, consumed two at a time
// NOTE(review): _mm_stream_si128 requires 16-byte-aligned addresses; presumably the caller only
// selects this "_mod8" variant when the planes and width satisfy that -- confirm.
// The scaling constants (xC_Y_L_MA_16, xC_UV_L_MA_16_420P, RSFT_ONE, LSFT_YCC_16, Y_L_RSH_16,
// UV_L_RSH_16_420P, UV_OFFSET_x2, xC_SUFFLE_YCP_Y) are not defined in the visible source
// (presumably convert_const.h); the comments below describe the data flow only.
void convert_yc48_to_nv12_16bit_avx_mod8(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	int x, y;
	short *dst_Y = (short *)pixel_data->data[0];
	short *dst_C = (short *)pixel_data->data[1];
	short *ycp, *ycpw;
	short *Y = NULL, *C = NULL;
	// Word-lane masks for _mm_blend_epi16 (bit n set = take lane n from the second operand).
	const int MASK_INT_Y  = 0x80 + 0x10 + 0x02;
	const int MASK_INT_UV = 0x40 + 0x20 + 0x01;
	__m128i x0, x1, x2, x3, x6, x7;
	for (y = 0; y < height; y += 2) {
		ycp = (short*)pixel + width * y * 3;   // row y
		ycpw= ycp + width*3;                   // row y+1
		Y   = (short*)dst_Y + width * y;
		C   = (short*)dst_C + width * y / 2;
		for (x = 0; x < width; x += 8, ycp += 24, ycpw += 24) {
			// Row 1: 8 pixels = 24 shorts spread over three registers.
			x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
			x2 = _mm_loadu_si128((__m128i *)(ycp +  8));
			x3 = _mm_loadu_si128((__m128i *)(ycp + 16));

			_mm_prefetch((const char *)ycpw, _MM_HINT_T1);

			// Gather the first component of all 8 pixels: lanes 0,3,6 of x1, lanes 1,4,7 of x2,
			// lanes 2,5 of x3; then reorder with the xC_SUFFLE_YCP_Y byte-shuffle control
			// (presumably into pixel order -- constant not visible here).
			x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
			x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
			x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

			// Gather the second and third components of pixels 0,2,4,6; the one-word rotate plus
			// the dword shuffle put the (2nd,3rd) pairs in pixel order 0,2,4,6.
			x6 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
			x6 = _mm_blend_epi16(x6, x3, MASK_INT_UV>>2); 
			x6 = _mm_alignr_epi8(x6, x6, 2);
			x6 = _mm_shuffle_epi32(x6, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 1

				//Y: compute row 1
				// x1 = all bits set, each 16-bit lane shifted right by RSFT_ONE
				// (the name suggests this leaves the value 1 -- TODO confirm).
				x1 = _mm_cmpeq_epi8(x1, x1);
				x1 = _mm_srli_epi16(x1, RSFT_ONE);
				// Pair every sample with that constant so madd yields
				// sample*coef0 + constant*coef1 in each 32-bit lane.
				x7 = _mm_unpackhi_epi16(x0, x1);
				x0 = _mm_unpacklo_epi16(x0, x1);

				x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
				x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
				x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
				x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
				// Post-shift offset: the constant shifted left by LSFT_YCC_16, widened to 32-bit lanes.
				x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
				x0 = _mm_add_epi32(x0, x1);
				x7 = _mm_add_epi32(x7, x1);

				// Narrow to 16 bits with unsigned saturation.
				x0 = _mm_packus_epi32(x0, x7);

			_mm_stream_si128((__m128i *)(Y + x), x0);

			// Row 2: same extraction as row 1.
			x1 = _mm_loadu_si128((__m128i *)(ycpw +  0));
			x2 = _mm_loadu_si128((__m128i *)(ycpw +  8));
			x3 = _mm_loadu_si128((__m128i *)(ycpw + 16));

			x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
			x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
			x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

			x1 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
			x1 = _mm_blend_epi16(x1, x3, MASK_INT_UV>>2); 
			x1 = _mm_alignr_epi8(x1, x1, 2);
			x2 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 2

			//UV row 1 + row 2 (16-bit sum; scaled further below)
			x6 = _mm_add_epi16(x2, x6);
				
				//Y: compute row 2 (same steps as row 1)
				x1 = _mm_cmpeq_epi8(x1, x1);
				x1 = _mm_srli_epi16(x1, RSFT_ONE);
				x7 = _mm_unpackhi_epi16(x0, x1);
				x0 = _mm_unpacklo_epi16(x0, x1);

				x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
				x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
				x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
				x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
				x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
				x0 = _mm_add_epi32(x0, x1);
				x7 = _mm_add_epi32(x7, x1);

				x0 = _mm_packus_epi32(x0, x7);

			_mm_stream_si128((__m128i *)(Y + x + width), x0);

			// Chroma: add UV_OFFSET_x2 to the two-row sum, then apply the same
			// multiply-add / shift / offset / saturate sequence with the 4:2:0 progressive constants.
			x0 = _mm_add_epi16(x6, _mm_set1_epi16(UV_OFFSET_x2));

			// x1 was overwritten with the 32-bit offset above, so the constant is rebuilt here.
			x1 = _mm_cmpeq_epi8(x1, x1);
			x1 = _mm_srli_epi16(x1, RSFT_ONE);
			x7 = _mm_unpackhi_epi16(x0, x1);
			x0 = _mm_unpacklo_epi16(x0, x1);

			x0 = _mm_madd_epi16(x0, xC_UV_L_MA_16_420P);
			x7 = _mm_madd_epi16(x7, xC_UV_L_MA_16_420P);
			x0 = _mm_srai_epi32(x0, UV_L_RSH_16_420P);
			x7 = _mm_srai_epi32(x7, UV_L_RSH_16_420P);
			x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
			x0 = _mm_add_epi32(x0, x1);
			x7 = _mm_add_epi32(x7, x1);

			x0 = _mm_packus_epi32(x0, x7);

			_mm_stream_si128((__m128i *)(C + x), x0);
		}
	}
}

// Interlaced YC48 -> 16-bit NV12 conversion, 8 pixels per step, using non-temporal stores.
// Rows are handled in groups of four; for i = 0,1 the rows (y+i) and (y+i+2) are combined,
// so chroma is only mixed between rows of the same field.
//   pixel      : source frame read as 3 shorts per pixel, row pitch = width*3 shorts
//   pixel_data : data[0] is written as the 16-bit Y plane (pitch = width shorts);
//                data[1] is written as the 16-bit chroma plane (row y/2 + i for each pass)
//   width      : pixels per row; rows are processed in whole 8-pixel groups, no tail handling
//   height     : number of rows, consumed four at a time
// NOTE(review): _mm_stream_si128 requires 16-byte-aligned addresses; presumably the caller only
// selects this "_mod8" variant when the planes and width satisfy that -- confirm.
// The scaling constants (xC_Y_L_MA_16, xC_UV_L_MA_16_420I, RSFT_ONE, LSFT_YCC_16, Y_L_RSH_16,
// UV_L_RSH_16_420I, UV_OFFSET_x1, xC_SUFFLE_YCP_Y) are not defined in the visible source
// (presumably convert_const.h); the comments below describe the data flow only.
void convert_yc48_to_nv12_i_16bit_avx_mod8(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	int x, y, i;
	short *dst_Y = (short *)pixel_data->data[0];
	short *dst_C = (short *)pixel_data->data[1];
	short *ycp, *ycpw;
	short *Y = NULL, *C = NULL;
	// Word-lane masks for _mm_blend_epi16 (bit n set = take lane n from the second operand).
	const int MASK_INT_Y  = 0x80 + 0x10 + 0x02;
	const int MASK_INT_UV = 0x40 + 0x20 + 0x01;
	__m128i x0, x1, x2, x3, x6, x7;
	for (y = 0; y < height; y += 4) {
		for (i = 0; i < 2; i++) {
			ycp = (short*)pixel + width * (y + i) * 3;   // row y+i
			ycpw= ycp + width*2*3;                       // row y+i+2 (same field)
			Y   = (short*)dst_Y + width * (y + i);
			C   = (short*)dst_C + width * (y + i*2) / 2;
			for (x = 0; x < width; x += 8, ycp += 24, ycpw += 24) {
				// Row 1+i: 8 pixels = 24 shorts spread over three registers.
				x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
				x2 = _mm_loadu_si128((__m128i *)(ycp +  8));
				x3 = _mm_loadu_si128((__m128i *)(ycp + 16));

				_mm_prefetch((const char *)ycpw, _MM_HINT_T1);

				// Gather the first component of all 8 pixels (lanes 0,3,6 of x1, 1,4,7 of x2,
				// 2,5 of x3), then reorder with the xC_SUFFLE_YCP_Y byte-shuffle control
				// (presumably into pixel order -- constant not visible here).
				x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
				x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
				x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

				// Gather the second and third components of pixels 0,2,4,6; the one-word rotate
				// plus the dword shuffle put the (2nd,3rd) pairs in pixel order 0,2,4,6.
				x6 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
				x6 = _mm_blend_epi16(x6, x3, MASK_INT_UV>>2);
				x6 = _mm_alignr_epi8(x6, x6, 2);
				x6 = _mm_shuffle_epi32(x6, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 1+i

					//Y: compute row 1+i
					// x1 = all bits set, each 16-bit lane shifted right by RSFT_ONE
					// (the name suggests this leaves the value 1 -- TODO confirm).
					x1 = _mm_cmpeq_epi8(x1, x1);
					x1 = _mm_srli_epi16(x1, RSFT_ONE);
					// Pair every sample with that constant so madd yields
					// sample*coef0 + constant*coef1 in each 32-bit lane.
					x7 = _mm_unpackhi_epi16(x0, x1);
					x0 = _mm_unpacklo_epi16(x0, x1);

					x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
					x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
					x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
					x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
					// Post-shift offset: the constant shifted left by LSFT_YCC_16, widened to 32-bit lanes.
					x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
					x0 = _mm_add_epi32(x0, x1);
					x7 = _mm_add_epi32(x7, x1);

					// Narrow to 16 bits with unsigned saturation.
					x0 = _mm_packus_epi32(x0, x7);

					_mm_stream_si128((__m128i *)(Y + x), x0);

				// Row 3+i: same extraction.
				x1 = _mm_loadu_si128((__m128i *)(ycpw +  0));
				x2 = _mm_loadu_si128((__m128i *)(ycpw +  8));
				x3 = _mm_loadu_si128((__m128i *)(ycpw + 16));

				x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
				x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
				x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

				x1 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
				x1 = _mm_blend_epi16(x1, x3, MASK_INT_UV>>2); 
				x1 = _mm_alignr_epi8(x1, x1, 2);
				x2 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 3+i

					//Y: compute row 3+i (same steps as above)
					x1 = _mm_cmpeq_epi8(x1, x1);
					x1 = _mm_srli_epi16(x1, RSFT_ONE);
					x7 = _mm_unpackhi_epi16(x0, x1);
					x0 = _mm_unpacklo_epi16(x0, x1);

					x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
					x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
					x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
					x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
					x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
					x0 = _mm_add_epi32(x0, x1);
					x7 = _mm_add_epi32(x7, x1);

					x0 = _mm_packus_epi32(x0, x7);

					_mm_stream_si128((__m128i *)(Y + x + width*2), x0);

				// Chroma: offset both rows by UV_OFFSET_x1 before the weighted mix.
				x6 = _mm_add_epi16(x6, _mm_set1_epi16(UV_OFFSET_x1));
				x2 = _mm_add_epi16(x2, _mm_set1_epi16(UV_OFFSET_x1));

				// x1 was overwritten with the 32-bit offset above, so the constant is rebuilt here.
				x1 = _mm_cmpeq_epi8(x1, x1);
				x1 = _mm_srli_epi16(x1, RSFT_ONE);

				// Pair each chroma sample of both rows with the constant.
				x7 = _mm_unpackhi_epi16(x6, x1);
				x6 = _mm_unpacklo_epi16(x6, x1);
				x3 = _mm_unpackhi_epi16(x2, x1);
				x2 = _mm_unpacklo_epi16(x2, x1);

				// Row 1+i uses coefficient set i, row 3+i uses the other set; the two
				// products are summed before the shift.
				x6 = _mm_madd_epi16(x6, xC_UV_L_MA_16_420I(i));
				x7 = _mm_madd_epi16(x7, xC_UV_L_MA_16_420I(i));
				x2 = _mm_madd_epi16(x2, xC_UV_L_MA_16_420I((i+1)&0x01));
				x3 = _mm_madd_epi16(x3, xC_UV_L_MA_16_420I((i+1)&0x01));
				x0 = _mm_add_epi32(x6, x2);
				x7 = _mm_add_epi32(x7, x3);
				x0 = _mm_srai_epi32(x0, UV_L_RSH_16_420I);
				x7 = _mm_srai_epi32(x7, UV_L_RSH_16_420I);
				x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
				x0 = _mm_add_epi32(x0, x1);
				x7 = _mm_add_epi32(x7, x1);

				x0 = _mm_packus_epi32(x0, x7);

				_mm_stream_si128((__m128i *)(C + x), x0);
			}
		}
	}
}

// YC48 -> planar 16-bit YUV 4:4:4 conversion, 8 pixels per step, using non-temporal stores.
// The frame is walked as one flat run of width*height pixels (3 shorts each); there is no
// per-row addressing and no tail handling for a pixel count that is not a multiple of 8.
//   pixel      : source frame, 3 shorts per pixel
//   pixel_data : data[0], data[1], data[2] receive the first, second and third component
//                of every pixel as three separate 16-bit planes
// NOTE(review): _mm_stream_si128 requires 16-byte-aligned destinations -- confirm the callers
// guarantee that for all three planes.
// The scaling constants (xC_Y_L_MA_16, xC_UV_L_MA_16_444, RSFT_ONE, LSFT_YCC_16, Y_L_RSH_16,
// UV_L_RSH_16_444, UV_OFFSET_x1, xC_SUFFLE_YCP_Y) are not defined in the visible source
// (presumably convert_const.h); the comments below describe the data flow only.
void convert_yc48_to_yuv444_16bit_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	short *Y = (short *)pixel_data->data[0];
	short *U = (short *)pixel_data->data[1];
	short *V = (short *)pixel_data->data[2];
	short *ycp;
	short *const ycp_fin = (short *)pixel + width * height * 3;
	// Word-lane mask for _mm_blend_epi16: lanes 0,3,6 (and lanes 1,4,7 when shifted left by 1).
	const int MASK_INT = 0x40 + 0x08 + 0x01;
	__m128i x0, x1, x2, x3, x6, x7;
	for (ycp = (short *)pixel; ycp < ycp_fin; ycp += 24, Y += 8, U += 8, V += 8) {
		// 8 pixels = 24 shorts spread over three registers.
		x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
		x2 = _mm_loadu_si128((__m128i *)(ycp +  8));
		x3 = _mm_loadu_si128((__m128i *)(ycp + 16));

		// Two blend passes separate the three components: x0 collects the first component of
		// all 8 pixels, x6 the second, x7 the third (each still in scrambled lane order).
		x0 = _mm_blend_epi16(x3, x1, MASK_INT);
		x6 = _mm_blend_epi16(x2, x3, MASK_INT);
		x7 = _mm_blend_epi16(x1, x2, MASK_INT);

		x0 = _mm_blend_epi16(x0, x2, MASK_INT<<1);
		x6 = _mm_blend_epi16(x6, x1, MASK_INT<<1);
		x7 = _mm_blend_epi16(x7, x3, MASK_INT<<1);

		// Reorder each register with the byte-shuffle control; the control itself is rotated
		// by 6 bytes between uses because each component's lane scramble differs by that rotation.
		x1 = xC_SUFFLE_YCP_Y;
		x0 = _mm_shuffle_epi8(x0, x1);
		x1 = _mm_alignr_epi8(x1, x1, 6);
		x6 = _mm_shuffle_epi8(x6, x1);
		x1 = _mm_alignr_epi8(x1, x1, 6);
		x3 = _mm_shuffle_epi8(x7, x1);

			//Y computation
			// x1 = all bits set, each 16-bit lane shifted right by RSFT_ONE
			// (the name suggests this leaves the value 1 -- TODO confirm).
			x1 = _mm_cmpeq_epi8(x1, x1);
			x1 = _mm_srli_epi16(x1, RSFT_ONE);
			// Pair every sample with that constant so madd yields
			// sample*coef0 + constant*coef1 in each 32-bit lane.
			x7 = _mm_unpackhi_epi16(x0, x1);
			x0 = _mm_unpacklo_epi16(x0, x1);

			x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
			x7 = _mm_madd_epi16(x7, xC_Y_L_MA_16);
			x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
			x7 = _mm_srai_epi32(x7, Y_L_RSH_16);
			// Post-shift offset: the constant shifted left by LSFT_YCC_16, widened to 32-bit lanes.
			x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
			x0 = _mm_add_epi32(x0, x1);
			x7 = _mm_add_epi32(x7, x1);

			// Narrow to 16 bits with unsigned saturation.
			x0 = _mm_packus_epi32(x0, x7);

		_mm_stream_si128((__m128i *)Y, x0);

			//U computation: add UV_OFFSET_x1, then the same sequence with the 4:4:4 chroma constants
			x0 = _mm_add_epi16(x6, _mm_set1_epi16(UV_OFFSET_x1));

			// x1 was overwritten with the 32-bit offset above, so the constant is rebuilt here.
			x1 = _mm_cmpeq_epi8(x1, x1);
			x1 = _mm_srli_epi16(x1, RSFT_ONE);
			x7 = _mm_unpackhi_epi16(x0, x1);
			x0 = _mm_unpacklo_epi16(x0, x1);

			x0 = _mm_madd_epi16(x0, xC_UV_L_MA_16_444);
			x7 = _mm_madd_epi16(x7, xC_UV_L_MA_16_444);
			x0 = _mm_srai_epi32(x0, UV_L_RSH_16_444);
			x7 = _mm_srai_epi32(x7, UV_L_RSH_16_444);
			x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
			x0 = _mm_add_epi32(x0, x1);
			x7 = _mm_add_epi32(x7, x1);

			x0 = _mm_packus_epi32(x0, x7);

		_mm_stream_si128((__m128i *)U, x0);

			//V computation: identical to U, applied to the third component held in x3
			x0 = _mm_add_epi16(x3, _mm_set1_epi16(UV_OFFSET_x1));

			x1 = _mm_cmpeq_epi8(x1, x1);
			x1 = _mm_srli_epi16(x1, RSFT_ONE);
			x7 = _mm_unpackhi_epi16(x0, x1);
			x0 = _mm_unpacklo_epi16(x0, x1);

			x0 = _mm_madd_epi16(x0, xC_UV_L_MA_16_444);
			x7 = _mm_madd_epi16(x7, xC_UV_L_MA_16_444);
			x0 = _mm_srai_epi32(x0, UV_L_RSH_16_444);
			x7 = _mm_srai_epi32(x7, UV_L_RSH_16_444);
			x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
			x0 = _mm_add_epi32(x0, x1);
			x7 = _mm_add_epi32(x7, x1);

			x0 = _mm_packus_epi32(x0, x7);

		_mm_stream_si128((__m128i *)V, x0);
	}
}

// YUY2 -> NV16 conversion, 16 pixels per step, using non-temporal stores.
// The frame is walked as one flat run of width*height*2 bytes; chroma is kept at full
// vertical resolution, so each 32 source bytes yield 16 luma bytes and 16 chroma bytes.
//   pixel      : packed source, read as 2 bytes per pixel
//   pixel_data : data[0] receives the luma plane, data[1] the interleaved chroma plane
//   width, height : only their product is used, to find the end of the source
// NOTE(review): _mm_stream_si128 requires 16-byte-aligned destinations, and there is no tail
// handling for a byte count that is not a multiple of 32 -- confirm the callers guarantee both.
void convert_yuy2_to_nv16_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	BYTE *src = (BYTE *)pixel;
	BYTE * const src_end = src + width * height * 2;
	BYTE *luma_out   = pixel_data->data[0];
	BYTE *chroma_out = pixel_data->data[1];
	while (src < src_end) {
		const __m128i src_a = _mm_loadu_si128((__m128i *)(src +  0));
		const __m128i src_b = _mm_loadu_si128((__m128i *)(src + 16));

		// Four rounds of byte interleaving gather the even source bytes (luma) in the low
		// result and the odd source bytes (chroma) in the high result.
		const __m128i mix1_lo = _mm_unpacklo_epi8(src_a, src_b);
		const __m128i mix1_hi = _mm_unpackhi_epi8(src_a, src_b);
		const __m128i mix2_lo = _mm_unpacklo_epi8(mix1_lo, mix1_hi);
		const __m128i mix2_hi = _mm_unpackhi_epi8(mix1_lo, mix1_hi);
		const __m128i mix3_lo = _mm_unpacklo_epi8(mix2_lo, mix2_hi);
		const __m128i mix3_hi = _mm_unpackhi_epi8(mix2_lo, mix2_hi);
		const __m128i luma    = _mm_unpacklo_epi8(mix3_lo, mix3_hi);
		const __m128i chroma  = _mm_unpackhi_epi8(mix3_lo, mix3_hi);

		_mm_stream_si128((__m128i *)luma_out, luma);
		_mm_stream_si128((__m128i *)chroma_out, chroma);

		src        += 32;
		luma_out   += 16;
		chroma_out += 16;
	}
}

// YC48 -> 16-bit NV16 conversion, 8 pixels per step, using non-temporal stores.
// The frame is walked as one flat run of width*height pixels (3 shorts each); chroma is taken
// from every second pixel and is not subsampled vertically.
//   pixel      : source frame, 3 shorts per pixel
//   pixel_data : data[0] is written as the 16-bit Y plane, data[1] as the 16-bit chroma plane
//   width, height : only their product is used, to find the end of the source
// NOTE(review): _mm_stream_si128 requires 16-byte-aligned destinations, and there is no tail
// handling for a pixel count that is not a multiple of 8 -- confirm the callers guarantee both.
// The scaling constants (xC_Y_L_MA_16, xC_UV_L_MA_16_444, RSFT_ONE, LSFT_YCC_16, Y_L_RSH_16,
// UV_L_RSH_16_444, UV_OFFSET_x1, xC_SUFFLE_YCP_Y) are not defined in the visible source
// (presumably convert_const.h); the comments below describe the data flow only.
void convert_yc48_to_nv16_16bit_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	short *dst_Y = (short *)pixel_data->data[0];
	short *dst_C = (short *)pixel_data->data[1];
	short *ycp = (short *)pixel;
	short * const ycp_fin = ycp + width * height * 3;
	// Word-lane masks for _mm_blend_epi16 (bit n set = take lane n from the second operand).
	const int MASK_INT_Y  = 0x80 + 0x10 + 0x02;
	const int MASK_INT_UV = 0x40 + 0x20 + 0x01;
	__m128i x0, x1, x2, x3;
	for (; ycp < ycp_fin; ycp += 24, dst_Y += 8, dst_C += 8) {
		// 8 pixels = 24 shorts spread over three registers.
		x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
		x2 = _mm_loadu_si128((__m128i *)(ycp +  8));
		x3 = _mm_loadu_si128((__m128i *)(ycp + 16));

		// Gather the first component of all 8 pixels (lanes 0,3,6 of x1, 1,4,7 of x2, 2,5 of x3),
		// then reorder with the xC_SUFFLE_YCP_Y byte-shuffle control
		// (presumably into pixel order -- constant not visible here).
		x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
		x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
		x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

		// Gather the second and third components of pixels 0,2,4,6; the one-word rotate plus
		// the dword shuffle put the (2nd,3rd) pairs in pixel order 0,2,4,6.
		x1 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
		x1 = _mm_blend_epi16(x1, x3, MASK_INT_UV>>2);
		x1 = _mm_alignr_epi8(x1, x1, 2);
		x3 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1, 2, 3, 0));//UV row

			//Y computation
			// x1 = all bits set, each 16-bit lane shifted right by RSFT_ONE
			// (the name suggests this leaves the value 1 -- TODO confirm).
			x1 = _mm_cmpeq_epi8(x1, x1);
			x1 = _mm_srli_epi16(x1, RSFT_ONE);
			// Pair every sample with that constant so madd yields
			// sample*coef0 + constant*coef1 in each 32-bit lane.
			x2 = _mm_unpackhi_epi16(x0, x1);
			x0 = _mm_unpacklo_epi16(x0, x1);

			x0 = _mm_madd_epi16(x0, xC_Y_L_MA_16);
			x2 = _mm_madd_epi16(x2, xC_Y_L_MA_16);
			x0 = _mm_srai_epi32(x0, Y_L_RSH_16);
			x2 = _mm_srai_epi32(x2, Y_L_RSH_16);
			// Post-shift offset: the constant shifted left by LSFT_YCC_16, widened to 32-bit lanes.
			x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
			x0 = _mm_add_epi32(x0, x1);
			x2 = _mm_add_epi32(x2, x1);

			// Narrow to 16 bits with unsigned saturation.
			x0 = _mm_packus_epi32(x0, x2);

		_mm_stream_si128((__m128i *)dst_Y, x0);

		// Chroma: add UV_OFFSET_x1, then the same sequence with the 4:4:4 chroma constants.
		x0 = _mm_add_epi16(x3, _mm_set1_epi16(UV_OFFSET_x1));

		// x1 was overwritten with the 32-bit offset above, so the constant is rebuilt here.
		x1 = _mm_cmpeq_epi8(x1, x1);
		x1 = _mm_srli_epi16(x1, RSFT_ONE);
		x2 = _mm_unpackhi_epi16(x0, x1);
		x0 = _mm_unpacklo_epi16(x0, x1);

		x0 = _mm_madd_epi16(x0, xC_UV_L_MA_16_444);
		x2 = _mm_madd_epi16(x2, xC_UV_L_MA_16_444);
		x0 = _mm_srai_epi32(x0, UV_L_RSH_16_444);
		x2 = _mm_srai_epi32(x2, UV_L_RSH_16_444);
		x1 = _mm_unpacklo_epi16(_mm_slli_epi16(x1, LSFT_YCC_16), _mm_setzero_si128()); //32bit YCC
		x0 = _mm_add_epi32(x0, x1);
		x2 = _mm_add_epi32(x2, x1);

		x0 = _mm_packus_epi32(x0, x2);

		_mm_stream_si128((__m128i *)dst_C, x0);
	}
}

// Progressive LW48 -> 16-bit NV12 conversion, 8 pixels per step.
// Samples are copied without any scaling: luma goes straight to the Y plane, and the chroma
// of two vertically adjacent rows is combined with a rounded 16-bit average.
//   pixel      : source frame of PIXEL_LW48 pixels; the loop consumes 48 bytes per 8 pixels
//                and treats them as 24 16-bit words (3 per pixel), row pitch = width pixels
//   pixel_data : data[0] is written as the 16-bit Y plane (pitch = width words);
//                data[1] is written as the 16-bit chroma plane, one row of width words
//                per two source rows
//   width      : pixels per row; rows are processed in whole 8-pixel groups, no tail handling
//   height     : number of rows, consumed two at a time
// NOTE(review): the source is read with aligned loads (_mm_load_si128), so every row start must
// be 16-byte aligned -- confirm the callers guarantee that.
// xC_SUFFLE_YCP_Y is not defined in the visible source (presumably convert_const.h).
void convert_lw48_to_nv12_16bit_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	__m128i x0, x1, x2, x3, x4;
	// Word-lane masks for _mm_blend_epi16 (bit n set = take lane n from the second operand).
	const int MASK_INT_Y  = 0x80 + 0x10 + 0x02;
	const int MASK_INT_UV = 0x40 + 0x20 + 0x01;
	USHORT *y_line = (USHORT *)pixel_data->data[0];
	USHORT *c_line = (USHORT *)pixel_data->data[1];
	PIXEL_LW48 *ycp_line = (PIXEL_LW48 *)pixel;
	for (int y = 0; y < height; y += 2, ycp_line += width*2, y_line += width*2, c_line += width) {
		BYTE *ycp = (BYTE *)ycp_line;                              // row y
		BYTE *ycp_w = (BYTE *)((PIXEL_LW48 *)ycp_line + width);    // row y+1
		USHORT *dst_y = y_line;
		USHORT *dst_c = c_line;
		USHORT *dst_y_fin = dst_y + width;
		for ( ; dst_y < dst_y_fin; ycp += 48, ycp_w += 48, dst_y += 8, dst_c += 8) {
			// Row 1: 8 pixels = 24 words spread over three registers.
			x1 = _mm_load_si128((__m128i *)(ycp +  0));
			x2 = _mm_load_si128((__m128i *)(ycp + 16));
			x3 = _mm_load_si128((__m128i *)(ycp + 32));

			// Gather the first component of all 8 pixels (lanes 0,3,6 of x1, 1,4,7 of x2,
			// 2,5 of x3), then reorder with the xC_SUFFLE_YCP_Y byte-shuffle control
			// (presumably into pixel order -- constant not visible here).
			x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
			x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
			x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

			// Gather the second and third components of pixels 0,2,4,6; the one-word rotate plus
			// the dword shuffle put the (2nd,3rd) pairs in pixel order 0,2,4,6.
			x4 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
			x4 = _mm_blend_epi16(x4, x3, MASK_INT_UV>>2);
			x4 = _mm_alignr_epi8(x4, x4, 2);
			x4 = _mm_shuffle_epi32(x4, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 1

			_mm_storeu_si128((__m128i*)dst_y, x0);

			// Row 2: same extraction.
			x1 = _mm_load_si128((__m128i *)(ycp_w +  0));
			x2 = _mm_load_si128((__m128i *)(ycp_w + 16));
			x3 = _mm_load_si128((__m128i *)(ycp_w + 32));

			x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
			x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
			x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

			x1 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
			x1 = _mm_blend_epi16(x1, x3, MASK_INT_UV>>2);
			x1 = _mm_alignr_epi8(x1, x1, 2);
			x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1, 2, 3, 0));//UV, row 2

			_mm_storeu_si128((__m128i*)(dst_y + width), x0);

			// Vertical chroma subsampling: rounded average of row 1 (x4) and row 2 (x1).
			x1 = _mm_avg_epu16(x1, x4);

			// Store the averaged chroma. The previous code stored x4 here, which wrote the
			// first row's chroma only and discarded the average computed just above.
			_mm_storeu_si128((__m128i*)dst_c, x1);
		}
	}
}
// Converts a packed LW48 frame to 16-bit luma + interleaved 16-bit chroma with
// vertical 2:1 chroma subsampling, interlaced variant.
// Rows are handled in groups of four; for i = 0, 1 the rows (y+i) and (y+i+2) are
// paired and their chroma is blended into chroma row (y>>1)+i.
//   pixel      : source frame, addressed as rows of `width` PIXEL_LW48 entries. The
//                inner loop consumes 48 bytes per 8 pixels, i.e. it assumes 6 bytes
//                per pixel (presumably three 16-bit words in Y, U, V order).
//   pixel_data : data[0] receives the luma plane, data[1] the chroma plane, both
//                with `width` USHORTs per row.
//   width      : pixels per row. 8 pixels are handled per iteration, so the last
//                iteration reads/writes past the row end unless width % 8 == 0.
//   height     : number of rows; rows y .. y+3 are touched for every y stepped by 4.
void convert_lw48_to_nv12_i_16bit_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	__m128i x0, x1, x2, x3, x4;
	// Word-select immediates for _mm_blend_epi16: which 16-bit lanes to take from the
	// 2nd (mask) and 3rd (mask>>2) source vector when gathering luma / chroma words.
	const int MASK_INT_Y  = 0x80 + 0x10 + 0x02;
	const int MASK_INT_UV = 0x40 + 0x20 + 0x01;
	USHORT *y_data = (USHORT *)pixel_data->data[0];
	USHORT *c_data = (USHORT *)pixel_data->data[1];
	for (int y = 0; y < height; y += 4) {
		for (int i = 0; i < 2; i++) {
			BYTE *ycp = (BYTE *)((PIXEL_LW48 *)pixel + (y+i)*width);  // row y+i
			BYTE *ycp_w = (BYTE *)((PIXEL_LW48 *)ycp + width*2);       // row y+i+2
			USHORT *dst_y = y_data + (y+i)*width;
			USHORT *dst_c = c_data + ((y>>1)+i)*width;
			USHORT *dst_y_fin = dst_y + width;
			for ( ; dst_y < dst_y_fin; ycp += 48, ycp_w += 48, dst_y += 8, dst_c += 8) {
				//----------- row y+i ---------------
				// Unaligned loads: nothing here guarantees that the frame pointer or the
				// row stride (width * 6 bytes) is 16-byte aligned.
				x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
				x2 = _mm_loadu_si128((__m128i *)(ycp + 16));
				x3 = _mm_loadu_si128((__m128i *)(ycp + 32));

				// Gather the 8 luma words into one vector, then reorder them.
				x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
				x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
				x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

				// Gather the chroma words of every other pixel, rotate by one word and
				// reorder the dwords so the U/V pairs come out in pixel order.
				x4 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
				x4 = _mm_blend_epi16(x4, x3, MASK_INT_UV>>2);
				x4 = _mm_alignr_epi8(x4, x4, 2);
				x4 = _mm_shuffle_epi32(x4, _MM_SHUFFLE(1, 2, 3, 0));//UVs

				_mm_storeu_si128((__m128i*)dst_y, x0);

				//----------- row y+i+2 ---------------
				x1 = _mm_loadu_si128((__m128i *)(ycp_w +  0));
				x2 = _mm_loadu_si128((__m128i *)(ycp_w + 16));
				x3 = _mm_loadu_si128((__m128i *)(ycp_w + 32));

				x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
				x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
				x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

				x1 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
				x1 = _mm_blend_epi16(x1, x3, MASK_INT_UV>>2);
				x1 = _mm_alignr_epi8(x1, x1, 2);
				x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1, 2, 3, 0));//UVs

				_mm_storeu_si128((__m128i*)(dst_y + width*2), x0);

				// Interleave the two rows' chroma word-by-word so that each 32-bit lane
				// holds (row y+i, row y+i+2), then form a weighted sum per lane and
				// divide by 4. xC_INTERLACE_WEIGHT(i) is defined elsewhere; presumably a
				// per-field weight pair summing to 4 -- TODO confirm.
				// NOTE(review): _mm_madd_epi16 multiplies *signed* 16-bit lanes; if chroma
				// samples can be >= 0x8000 the products would be wrong -- confirm the
				// value range of LW48 chroma against the caller.
				x0 = _mm_unpacklo_epi16(x4, x1);
				x1 = _mm_unpackhi_epi16(x4, x1);
				x0 = _mm_madd_epi16(x0, xC_INTERLACE_WEIGHT(i));
				x1 = _mm_madd_epi16(x1, xC_INTERLACE_WEIGHT(i));
				x0 = _mm_srli_epi32(x0, 2);
				x1 = _mm_srli_epi32(x1, 2);
				x1 = _mm_packus_epi32(x0, x1); // back to 8 x 16-bit with unsigned saturation

				_mm_storeu_si128((__m128i*)dst_c, x1);
			}
		}
	}
}
void convert_lw48_to_nv16_16bit_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	__m128i x0, x1, x2, x3;
	USHORT *dst_Y = (USHORT *)pixel_data->data[0];
	USHORT *dst_C = (USHORT *)pixel_data->data[1];
	BYTE *ycp = (BYTE *)pixel;
	BYTE * const ycp_fin = ycp + width * height * 6;
	const int MASK_INT_Y  = 0x80 + 0x10 + 0x02;
	const int MASK_INT_UV = 0x40 + 0x20 + 0x01;
	for (; ycp < ycp_fin; ycp += 48, dst_Y += 8, dst_C += 8) {
		x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
		x2 = _mm_loadu_si128((__m128i *)(ycp + 16));
		x3 = _mm_loadu_si128((__m128i *)(ycp + 32));

		x0 = _mm_blend_epi16(x1, x2, MASK_INT_Y);
		x0 = _mm_blend_epi16(x0, x3, MASK_INT_Y>>2);
		x0 = _mm_shuffle_epi8(x0, xC_SUFFLE_YCP_Y);

		x1 = _mm_blend_epi16(x1, x2, MASK_INT_UV);
		x1 = _mm_blend_epi16(x1, x3, MASK_INT_UV>>2);
		x1 = _mm_alignr_epi8(x1, x1, 2);
		x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1, 2, 3, 0));//UVs

		_mm_storeu_si128((__m128i *)dst_Y, x0);
		_mm_storeu_si128((__m128i *)dst_C, x1);
	}
}
void convert_lw48_to_yuv444_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	__m128i x0, x1, x2, x3, x6, x7;
	const int MASK_INT = 0x40 + 0x08 + 0x01;
	BYTE *dst_y = (BYTE *)pixel_data->data[0];
	BYTE *dst_u = (BYTE *)pixel_data->data[1];
	BYTE *dst_v = (BYTE *)pixel_data->data[2];
	BYTE *ycp = (BYTE *)pixel;
	BYTE * const ycp_fin = ycp + width * height * 6;
	__m128i xY, xU, xV;
	for ( ; ycp < ycp_fin; ycp += 96, dst_y += 16, dst_u += 16, dst_v += 16) {
		x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
		x2 = _mm_loadu_si128((__m128i *)(ycp + 16));
		x3 = _mm_loadu_si128((__m128i *)(ycp + 32));

		x0 = _mm_blend_epi16(x3, x1, MASK_INT);
		x6 = _mm_blend_epi16(x2, x3, MASK_INT);
		x7 = _mm_blend_epi16(x1, x2, MASK_INT);

		x0 = _mm_blend_epi16(x0, x2, MASK_INT<<1);
		x6 = _mm_blend_epi16(x6, x1, MASK_INT<<1);
		x7 = _mm_blend_epi16(x7, x3, MASK_INT<<1);

		x1 = xC_SUFFLE_YCP_Y;
		x0 = _mm_shuffle_epi8(x0, x1);
		x6 = _mm_shuffle_epi8(x6, _mm_alignr_epi8(x1, x1, 6));
		x7 = _mm_shuffle_epi8(x7, _mm_alignr_epi8(x1, x1, 12));

		xY = _mm_srli_epi16(x0, 8);
		xU = _mm_srli_epi16(x6, 8);
		xV = _mm_srli_epi16(x7, 8);

		x1 = _mm_loadu_si128((__m128i *)(ycp + 48));
		x2 = _mm_loadu_si128((__m128i *)(ycp + 64));
		x3 = _mm_loadu_si128((__m128i *)(ycp + 80));

		x0 = _mm_blend_epi16(x3, x1, MASK_INT);
		x6 = _mm_blend_epi16(x2, x3, MASK_INT);
		x7 = _mm_blend_epi16(x1, x2, MASK_INT);

		x0 = _mm_blend_epi16(x0, x2, MASK_INT<<1);
		x6 = _mm_blend_epi16(x6, x1, MASK_INT<<1);
		x7 = _mm_blend_epi16(x7, x3, MASK_INT<<1);

		x1 = xC_SUFFLE_YCP_Y;
		x0 = _mm_shuffle_epi8(x0, x1);
		x6 = _mm_shuffle_epi8(x6, _mm_alignr_epi8(x1, x1, 6));
		x7 = _mm_shuffle_epi8(x7, _mm_alignr_epi8(x1, x1, 12));

		x0 = _mm_srli_epi16(x0, 8);
		x6 = _mm_srli_epi16(x6, 8);
		x7 = _mm_srli_epi16(x7, 8);

		xY = _mm_packus_epi16(xY, x0);
		xU = _mm_packus_epi16(xU, x6);
		xV = _mm_packus_epi16(xV, x7);

		_mm_storeu_si128((__m128i*)dst_y, xY);
		_mm_storeu_si128((__m128i*)dst_u, xU);
		_mm_storeu_si128((__m128i*)dst_v, xV);
	}
}
void convert_lw48_to_yuv444_16bit_avx(void *pixel, CONVERT_CF_DATA *pixel_data, const int width, const int height) {
	__m128i x0, x1, x2, x3, x6, x7;
	const int MASK_INT = 0x40 + 0x08 + 0x01;
	USHORT *dst_y = (USHORT *)pixel_data->data[0];
	USHORT *dst_u = (USHORT *)pixel_data->data[1];
	USHORT *dst_v = (USHORT *)pixel_data->data[2];
	BYTE *ycp = (BYTE *)pixel;
	BYTE * const ycp_fin = ycp + width * height * 6;
	for ( ; ycp < ycp_fin; ycp += 48, dst_y += 8, dst_u += 8, dst_v += 8) {
		x1 = _mm_loadu_si128((__m128i *)(ycp +  0));
		x2 = _mm_loadu_si128((__m128i *)(ycp + 16));
		x3 = _mm_loadu_si128((__m128i *)(ycp + 32));

		x0 = _mm_blend_epi16(x3, x1, MASK_INT);
		x6 = _mm_blend_epi16(x2, x3, MASK_INT);
		x7 = _mm_blend_epi16(x1, x2, MASK_INT);

		x0 = _mm_blend_epi16(x0, x2, MASK_INT<<1);
		x6 = _mm_blend_epi16(x6, x1, MASK_INT<<1);
		x7 = _mm_blend_epi16(x7, x3, MASK_INT<<1);

		x1 = xC_SUFFLE_YCP_Y;
		x0 = _mm_shuffle_epi8(x0, x1);
		x6 = _mm_shuffle_epi8(x6, _mm_alignr_epi8(x1, x1, 6));
		x7 = _mm_shuffle_epi8(x7, _mm_alignr_epi8(x1, x1, 12));

		_mm_storeu_si128((__m128i*)dst_y, x0);
		_mm_storeu_si128((__m128i*)dst_u, x6);
		_mm_storeu_si128((__m128i*)dst_v, x7);
	}
}
#endif //(_MSC_VER >= 1600)

