/*
 *  yuv420-rgb555.c
 *
 *     Copyright (C) Erik Walthinsen - April 2000
 *
 *  This is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your
 *  option) any later version.
 *
 *  This software is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

/* This is the first pass of a routine that does the conversion of a 16x16
 * region of luma, and two 8x8 regions of chroma, into a 16x16 region of
 * RGB 5:5:5 pixels.  It still needs some work.
 */

#include <glib.h>

/* Packed 16-bit coefficients, replicated across the four word lanes of a
 * 64-bit MMX operand.  The decimal value of each lane is given beside the
 * constant.
 * NOTE(review): the values look like the usual ITU-R BT.601 YCbCr->RGB
 * factors in fixed point (19071/16384 ~ 1.164, 13074/8192 ~ 1.596,
 * 6661/8192 ~ 0.813, 3203/8192 ~ 0.391) -- confirm against the intended
 * colour matrix.  16351/8192 ~ 1.996 is slightly off the usual 2.018
 * (16531); possibly a digit transposition -- verify.
 * NOTE(review): the `L` suffix only yields a 64-bit constant where the
 * compiler widens over-long literals automatically -- confirm for the
 * targeted compilers.
 */
static guint64 luma_impact = 0x4a7f4a7f4a7f4a7fL;    // 19071
/* subtracted from the luma product in calculate_pixels() */
static guint64 luma_impact_16 = 0x0254025402540254L; // 596
/* used with pmaddwd: low word of each pair is 6661, high word is 3203 */
static guint64 green_impact = 0x0c831a050c831a05L;     // 3203, 6661
static guint64 blue_impact = 0x3fdf3fdf3fdf3fdfL;      // 16351
static guint64 red_impact = 0x3312331233123312L;       // 13074

/* masks selecting the high / low byte of every 16-bit lane */
static guint64 upperbyte = 0xff00ff00ff00ff00L;
static guint64 lowerbyte = 0x00ff00ff00ff00ffL;
/* 128 in every lane: subtracted from the chroma samples, and also used as a
 * pmullw multiplier (x128) on the luma samples */
static guint64 val128w = 0x0080008000800080L;
/* 256 in every lane: pmullw by this is a left shift by 8 */
static guint64 shift_left_8 = 0x0100010001000100L;
/* keeps the top five bits of an 8-bit component held in a 16-bit lane */
static guint64 mask_byte_5_bits = 0x00f800f800f800f8L;
/* pmaddwd multipliers (1 and 0x400 in alternating lanes) that merge two
 * interleaved 5-bit components into one 32-bit lane */
static guint64 red_blue_pack = 0x0400000104000001L;
static guint64 blue_red_pack = 0x0001040000010400L;

/* forward declaration; the definition follows yuv420_to_rgb555_16x16() */
static inline void calculate_pixels(guint8 *y_plane,guint32 y_stride,
                                    guint8 *rgb_plane,guint32 rgb_stride);

/* YUV 4:2:0 conversion to RGB 05:5:5, 16x16 block
 * 
 * This routine will take three planes (y_plane, u_plane, v_plane) containing
 * YUV pixel data, and convert into the output plane.
 * A total of 256 pixels are converted, in a 16x16 block.  8x8 chroma samples
 * are used.
void yuv420_to_rgb555_16x16(guint8 *y_plane,guint32 y_stride,guint8 *u_plane,guint32 u_stride,guint8
*v_plane,guint32 v_stride,guint8 *rgb_plane,guint32 rgb_stride) {
  int y;

  for (y=0;y<8;y++) {
    /* start with leftmost chroma samples */
/***** Calculate 4 RGB chroma samples' impact for 16 pixels total *****/
	movq_m2r(*v_plane,mm1);			// 2
	pandn_r2r(mm7,mm7);			// 3
	punpckhbw_r2r(mm7,mm1);			// 5
	psubw_m2r(val128w,mm1);			// 7
	movq_r2r(mm1,mm3);			// 8
	movq_m2r(*u_plane,mm0);			// 1
	punpckhbw_r2r(mm7,mm0);			// 4
	movq_r2r(mm1,mm2);			// 12
	psubw_m2r(val128w,mm0);			// 6
	punpckhwd_r2r(mm0,mm3);			// 9
	punpcklwd_r2r(mm0,mm2);			// 13
	pmaddwd_m2r(green_impact,mm3);		// 10
	psrad_i2r(8,mm3);			// 11
	pmaddwd_m2r(green_impact,mm2);		// 14
	psrad_i2r(8,mm2);			// 15
	packssdw_r2r(mm3,mm2);			// 16

	pmullw_m2r(shift_left_8,mm0);
	pmullw_m2r(shift_left_8,mm1);
	pmulhw_m2r(red_impact,mm0);
	pmulhw_m2r(blue_impact,mm1);

    calculate_pixels(y_plane,y_stride,rgb_plane,rgb_stride);

    /* update pointers */
    y_plane += 8;
    rgb_plane += 8;

    /* now do the rightmost chroma samples */
/***** Calculate 4 RGB chroma samples' impact for 16 pixels total *****/
	movq_m2r(*v_plane,mm1);			// 2
	pandn_r2r(mm7,mm7);			// 3
	punpcklbw_r2r(mm7,mm1);			// 5
	psubw_m2r(val128w,mm1);			// 7
	movq_r2r(mm1,mm3);			// 8
	movq_m2r(*u_plane,mm0);			// 1
	punpcklbw_r2r(mm7,mm0);			// 4
	movq_r2r(mm1,mm2);			// 12
	psubw_m2r(val128w,mm0);			// 6
	punpckhwd_r2r(mm0,mm3);			// 9
	punpcklwd_r2r(mm0,mm2);			// 13
	pmaddwd_m2r(green_impact,mm3);		// 10
	psrad_i2r(8,mm3);			// 11
	pmaddwd_m2r(green_impact,mm2);		// 14
	psrad_i2r(8,mm2);			// 15
	packssdw_r2r(mm3,mm2);			// 16

	pmullw_m2r(shift_left_8,mm0);
	pmullw_m2r(shift_left_8,mm1);
	pmulhw_m2r(red_impact,mm0);
	pmulhw_m2r(blue_impact,mm1);

    calculate_pixels(y_plane,y_stride,rgb_plane,rgb_stride);

    /* update pointers */
    u_plane += u_stride - 8;
    v_plane += v_stride - 8;
    y_plane += y_stride - 16;
    rgb_plane += rgb_stride - 32;
  }
}

/* Convert a strip of 8 pixels by 2 rows from luma + precomputed chroma
 * terms into RGB 5:5:5.
 *
 *   y_plane    - first of 8 luma bytes of the upper row; the lower row is
 *                read from y_plane + y_stride
 *   rgb_plane  - receives 16 bytes (8 pixels) for the upper row; the lower
 *                row is written at rgb_plane + rgb_stride
 *
 * Register contract (set up by the caller, four 16-bit lanes each, one lane
 * per pair of horizontally adjacent pixels):
 *   mm0 - term added to luma for the component stored in bits 0-4
 *   mm1 - term added to luma for the component stored in bits 10-14
 *   mm2 - term subtracted from luma for the component stored in bits 5-9
 * mm0-mm2 are only read; mm3-mm7 are clobbered.
 *
 * Naming note: the passes labelled "even" use the upperbyte mask, i.e. the
 * bytes at offsets 1,3,5,7; the "odd" passes use offsets 0,2,4,6.  The final
 * interleave places the offset-0,2,4,6 results first, so the output pixels
 * come out in memory order.
 */
static inline void calculate_pixels(guint8 *y_plane,guint32 y_stride,
                                    guint8 *rgb_plane,guint32 rgb_stride) {
/***** Do the first group of four pixels *****/
// load 8 luma pixels
	movq_m2r(*y_plane,mm3);

// calculate luma impact for even pixels
	movq_r2r(mm3,mm4);			// 1
	pand_m2r(upperbyte,mm4);		// 2
	psrld_i2r(1,mm4);			// 3
	pmulhw_m2r(luma_impact,mm4);		// 4
	psubw_m2r(luma_impact_16,mm4);		// 5

// join red&blue chroma to luma for even 4 pixels
	movq_r2r(mm0,mm5);			// 6
	movq_r2r(mm1,mm6);			// 7
	paddw_r2r(mm4,mm5);			// 8
	paddw_r2r(mm4,mm6);			// 9
	psraw_i2r(5,mm5);			// 10
	psraw_i2r(5,mm6);			// 11
// pack/unpack pair clamps each lane to 0..255; whatever lands in the high
// byte of a lane is discarded by the mask_byte_5_bits pand below
	packuswb_r2r(mm7,mm5);			// 12
	punpcklbw_r2r(mm7,mm5);			// 13
	packuswb_r2r(mm7,mm6);			// 14
	punpcklbw_r2r(mm7,mm6);			// 15

// interleave red&blue components and merge into pixel ranges
	movq_r2r(mm6,mm7);
	punpcklwd_r2r(mm5,mm6);
	punpckhwd_r2r(mm7,mm5);
	pand_m2r(mask_byte_5_bits,mm5);
	pand_m2r(mask_byte_5_bits,mm6);
	pmaddwd_m2r(red_blue_pack,mm5);
	pmaddwd_m2r(blue_red_pack,mm6);
	psrld_i2r(3,mm5);
	psrld_i2r(3,mm6);
	packssdw_r2r(mm5,mm6);

// REGISTER usage at this point:
// mm0 - blue impact		KEEP - need for all 4 luma sets
// mm1 - red impact		KEEP - ditto
// mm2 - green impact		KEEP - ditto
// mm3 - 8 luma samples		KEEP - need for next luma set
// mm4 - even luma impact
// mm5 - scratch (already packed into mm6)
// mm6 - 4 semipacked pixels	KEEP - to join green into
// mm7 - red components

// calculate green impact
	pandn_r2r(mm7,mm7);			// mm7 = 0
	psubw_r2r(mm2,mm4);
	psraw_i2r(5,mm4);
	packuswb_r2r(mm7,mm4);
	punpcklbw_r2r(mm7,mm4);
	pand_m2r(mask_byte_5_bits,mm4);
	psllw_i2r(2,mm4);
	por_r2r(mm4,mm6);


/***** Do the second group of four pixels *****/
// NOTE: at this point mm6 is untouchable, since it has even pixels
// calculate luma impact for odd pixels
	pand_m2r(lowerbyte,mm3);			// 1
	pmullw_m2r(val128w,mm3);			// 2
	pmulhw_m2r(luma_impact,mm3);			// 3
	psubw_m2r(luma_impact_16,mm3);			// 4

// join red&blue chroma to luma for odd 4 pixels
	movq_r2r(mm0,mm4);				// 5
	movq_r2r(mm1,mm5);				// 6
	paddw_r2r(mm3,mm4);				// 7
	paddw_r2r(mm3,mm5);				// 8
	psraw_i2r(5,mm4);				// 9
	psraw_i2r(5,mm5);				// 10
	packuswb_r2r(mm7,mm4);				// 11
	punpcklbw_r2r(mm7,mm4);				// 12
	packuswb_r2r(mm7,mm5);				// 13
	punpcklbw_r2r(mm7,mm5);				// 14

// interleave red&blue components and merge into pixel ranges
	movq_r2r(mm5,mm7);				// 15
	punpcklwd_r2r(mm4,mm5);				// 16
	punpckhwd_r2r(mm7,mm4);				// 17
	pand_m2r(mask_byte_5_bits,mm4);			// 18
	pand_m2r(mask_byte_5_bits,mm5);			// 19
	pmaddwd_m2r(red_blue_pack,mm4);			// 20
	pmaddwd_m2r(blue_red_pack,mm5);			// 21
	psrld_i2r(3,mm4);				// 22
	psrld_i2r(3,mm5);				// 23
	packssdw_r2r(mm4,mm5);				// 24

// merge in green components
	pandn_r2r(mm7,mm7);				// 25
	psubw_r2r(mm2,mm3);				// 26
	psraw_i2r(5,mm3);				// 27
	packuswb_r2r(mm7,mm3);				// 28
	punpcklbw_r2r(mm7,mm3);				// 29
	pand_m2r(mask_byte_5_bits,mm3);			// 30
	psllw_i2r(2,mm3);				// 31
	por_r2r(mm3,mm5);				// 32


// interleave 0,2,4,6 and 1,3,5,7 pixels
	movq_r2r(mm5,mm7);
	punpcklwd_r2r(mm6,mm7);
	punpckhwd_r2r(mm6,mm5);
// write out pixels: 4 pixels (8 bytes) per quadword
	movq_r2m(mm7,*rgb_plane);
	movq_r2m(mm5,*(rgb_plane+8));


/***** Do the third group of four pixels *****/
// load 8 luma pixels (second row)
	movq_m2r(*(y_plane+y_stride),mm3);

// calculate luma impact for even pixels
	movq_r2r(mm3,mm4);
	pand_m2r(upperbyte,mm4);
	psrld_i2r(1,mm4);
	pmulhw_m2r(luma_impact,mm4);
	psubw_m2r(luma_impact_16,mm4);

// join red&blue chroma to luma for even 4 pixels
	movq_r2r(mm0,mm5);
	movq_r2r(mm1,mm6);
	paddw_r2r(mm4,mm5);
	paddw_r2r(mm4,mm6);
	psraw_i2r(5,mm5);
	psraw_i2r(5,mm6);
// mm7 still holds the pixels stored above rather than zero; the stray high
// bytes this introduces are removed by the mask_byte_5_bits pand below
	packuswb_r2r(mm7,mm5);
	punpcklbw_r2r(mm7,mm5);
	packuswb_r2r(mm7,mm6);
	punpcklbw_r2r(mm7,mm6);

// interleave red&blue components and merge into pixel ranges
	movq_r2r(mm6,mm7);
	punpcklwd_r2r(mm5,mm6);
	punpckhwd_r2r(mm7,mm5);
	pand_m2r(mask_byte_5_bits,mm5);
	pand_m2r(mask_byte_5_bits,mm6);
	pmaddwd_m2r(red_blue_pack,mm5);
	pmaddwd_m2r(blue_red_pack,mm6);
	psrld_i2r(3,mm5);
	psrld_i2r(3,mm6);
	packssdw_r2r(mm5,mm6);

// REGISTER usage at this point:
// mm0 - blue impact		KEEP - need for all 4 luma sets
// mm1 - red impact		KEEP - ditto
// mm2 - green impact		KEEP - ditto
// mm3 - 8 luma samples		KEEP - need for next luma set
// mm4 - even luma impact
// mm5 - scratch (already packed into mm6)
// mm6 - 4 semipacked pixels	KEEP - to join green into
// mm7 - red components

// calculate green impact
	pandn_r2r(mm7,mm7);			// mm7 = 0
	psubw_r2r(mm2,mm4);
	psraw_i2r(5,mm4);
	packuswb_r2r(mm7,mm4);
	punpcklbw_r2r(mm7,mm4);
	pand_m2r(mask_byte_5_bits,mm4);
	psllw_i2r(2,mm4);
	por_r2r(mm4,mm6);


/***** Do the fourth group of four pixels *****/
// NOTE: at this point mm6 is untouchable, since it has even pixels
// calculate luma impact for odd pixels
	pand_m2r(lowerbyte,mm3);			// 1
	pmullw_m2r(val128w,mm3);			// 2
	pmulhw_m2r(luma_impact,mm3);			// 3
	psubw_m2r(luma_impact_16,mm3);			// 4

// join red&blue chroma to luma for odd 4 pixels
	movq_r2r(mm0,mm4);				// 5
	movq_r2r(mm1,mm5);				// 6
	paddw_r2r(mm3,mm4);				// 7
	paddw_r2r(mm3,mm5);				// 8
	psraw_i2r(5,mm4);				// 9
	psraw_i2r(5,mm5);				// 10
	packuswb_r2r(mm7,mm4);				// 11
	punpcklbw_r2r(mm7,mm4);				// 12
	packuswb_r2r(mm7,mm5);				// 13
	punpcklbw_r2r(mm7,mm5);				// 14

// interleave red&blue components and merge into pixel ranges
	movq_r2r(mm5,mm7);				// 15
	punpcklwd_r2r(mm4,mm5);				// 16
	punpckhwd_r2r(mm7,mm4);				// 17
	pand_m2r(mask_byte_5_bits,mm4);			// 18
	pand_m2r(mask_byte_5_bits,mm5);			// 19
	pmaddwd_m2r(red_blue_pack,mm4);			// 20
	pmaddwd_m2r(blue_red_pack,mm5);			// 21
	psrld_i2r(3,mm4);				// 22
	psrld_i2r(3,mm5);				// 23
	packssdw_r2r(mm4,mm5);				// 24

// merge in green components
	pandn_r2r(mm7,mm7);				// 25
	psubw_r2r(mm2,mm3);				// 26
	psraw_i2r(5,mm3);				// 27
	packuswb_r2r(mm7,mm3);				// 28
	punpcklbw_r2r(mm7,mm3);				// 29
	pand_m2r(mask_byte_5_bits,mm3);			// 30
	psllw_i2r(2,mm3);				// 31
	por_r2r(mm3,mm5);				// 32


// interleave 0,2,4,6 and 1,3,5,7 pixels
	movq_r2r(mm5,mm7);
	punpcklwd_r2r(mm6,mm7);
	punpckhwd_r2r(mm6,mm5);
// write out pixels: the second quadword follows the first by exactly 8 bytes
	movq_r2m(mm7,*(rgb_plane+rgb_stride));
	movq_r2m(mm5,*(rgb_plane+rgb_stride+8));
}