10a11,32
> #include <limits.h>
> 
> int64_t get_difference(struct timeval *start_time) // microseconds elapsed since *start_time
> {
>         struct timeval new_time;
> 	// sample the current time; the caller's *start_time is only read, never modified
> 	gettimeofday(&new_time, 0);
> 	// subtract field by field in the local copy; tv_usec can go negative here
> 	new_time.tv_usec -= start_time->tv_usec;
> 	new_time.tv_sec -= start_time->tv_sec;
> 	if(new_time.tv_usec < 0)
> 	{
> 		new_time.tv_usec += 1000000;
> 		new_time.tv_sec--;
> 	}
> 	// fold seconds and microseconds into one 64-bit microsecond count
> 	return (int64_t)new_time.tv_sec * 1000000 + 
> 		(int64_t)new_time.tv_usec;
> 
> }
> 
> 
32c54,55
< 	this->temp = 0;
---
> 	row_spans_h = 0;
> 	row_spans = 0;
38c61,66
< 	if(temp) delete temp;
---
> 	if (row_spans)
> 	{
> 		for (int i = 0; i < row_spans_h; i++) 
> 			free(row_spans[i]); /* each row buffer is malloc/realloc memory */
> 		delete [] row_spans; /* the row table is an array: array delete required */
> 	}
45,85d72
< #define OVERSAMPLE 8
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< 
< #define DRAW_LINE_CLAMPED(type, value) \
< { \
< 	type **rows = (type**)frame->get_rows(); \
<  \
< 	if(draw_y2 != draw_y1) \
< 	{ \
< 		float slope = ((float)draw_x2 - draw_x1) / ((float)draw_y2 - draw_y1); \
< 		int w = frame->get_w() - 1; \
< 		int h = frame->get_h(); \
<  \
< 		for(float y = draw_y1; y < draw_y2; y++) \
< 		{ \
< 			if(y >= 0 && y < h) \
< 			{ \
< 				int x = (int)((y - draw_y1) * slope + draw_x1); \
< 				int y_i = (int)y; \
< 				int x_i = CLIP(x, 0, w); \
<  \
< 				if(rows[y_i][x_i] == value) \
< 					rows[y_i][x_i] = 0; \
< 				else \
< 					rows[y_i][x_i] = value; \
< 			} \
< 		} \
< 	} \
< }
< 
88,93c75,82
< void MaskUnit::draw_line_clamped(VFrame *frame, 
< 	int &x1, 
< 	int &y1, 
< 	int x2, 
< 	int y2,
< 	unsigned char k)
---
> inline void MaskUnit::draw_line_clamped(
> 	int draw_x1, 
> 	int draw_y1, 
> 	int draw_x2, 
> 	int draw_y2,
> 	int w,
> 	int h,
> 	int hoffset)
96,100c85
< 	int draw_x1;
< 	int draw_y1;
< 	int draw_x2;
< 	int draw_y2;
< 	unsigned char value;
---
> 	if (draw_y1 == draw_y2) return; 
102,126c87,124
< 	if(y2 < y1)
< 	{
< 		draw_x1 = x2;
< 		draw_y1 = y2;
< 		draw_x2 = x1;
< 		draw_y2 = y1;
< 	}
< 	else
< 	{
< 		draw_x1 = x1;
< 		draw_y1 = y1;
< 		draw_x2 = x2;
< 		draw_y2 = y2;
< 	}
< 
< 	switch(frame->get_color_model())
< 	{
< 		case BC_A8:
< 			DRAW_LINE_CLAMPED(unsigned char, k);
< 			break;
< 		
< 		case BC_A16:
< 			DRAW_LINE_CLAMPED(uint16_t, k);
< 			break;
< 	}
---
> 	if(draw_y2 < draw_y1)
> 	{ /* change the order */
> 		int tmp;
> 		tmp = draw_x1;
> 		draw_x1 = draw_x2;
> 		draw_x2 = tmp;
> 		tmp = draw_y1;
> 		draw_y1 = draw_y2;
> 		draw_y2 = tmp;
> 	}
> 
> 	float slope = ((float)draw_x2 - draw_x1) / ((float)draw_y2 - draw_y1); 
> 	w--;
> 	for(int y_i = draw_y1; y_i < draw_y2; y_i++) 
> 	{ 
> 		if (y_i >= h) 
> 			return; // since y gets larger, there is no point in continuing
> 		else if(y_i >= 0) 
> 		{ 
> 			int x = (int)(slope * (y_i - draw_y1) + draw_x1); 
> 			int x_i = CLIP(x, 0, w); 
> 
> 			/* insert x_i into this row's sorted span; span[0] = next free slot, span[1] = capacity */
> 			short index = 2; /* entries start after the two header slots */
> 			short *span = row_spans[y_i + hoffset];	
> 			while (index < span[0] && span[index] < x_i)
> 				index++;
> 			for (int j = span[0]; j > index; j--) {       // shift larger entries up one slot
> 				span[j] = span[j-1];
> 			}
> 			span[index] = x_i;
> 			span[0] ++;
> 			if (span[0] >= span[1]) { /* grow as soon as full: the next insert writes span[span[0]] */
> 				span[1] *= 2;
> 				row_spans[y_i + hoffset] = (short *) realloc (span, span[1] * sizeof(short)); /* be careful! row_spans has to be updated! */
> 			};
> 		} 
> 	} 
381d378
< // Generated oversampling frame
383a381
> 		int mask_color_model = mask->get_color_model();
386d383
< //printf("MaskUnit::process_package 1\n");
388,402c385,400
< 		if(temp && 
< 			(temp->get_w() != oversampled_package_w ||
< 			temp->get_h() != oversampled_package_h))
< 		{
< 			delete temp;
< 			temp = 0;
< 		}
< //printf("MaskUnit::process_package 1\n");
< 
< 		if(!temp)
< 		{
< 			temp = new VFrame(0, 
< 				oversampled_package_w, 
< 				oversampled_package_h,
< 				BC_A8);
---
> 		row_spans_creation.lock(); // there was a race before...
> 		if (!row_spans || row_spans_h != mask_h * OVERSAMPLE) {
> 			int i;	
> 			if (row_spans) {   /* size change: drop the old rows and the old table */
> 				for (i = 0; i < row_spans_h; i++) 
> 					free(row_spans[i]);
> 				delete [] row_spans; /* table was allocated with new[] */
> 			}
> 			row_spans_h = mask_h * OVERSAMPLE;
> 			row_spans = new short *[mask_h * OVERSAMPLE]; /* one span buffer per oversampled row */
> 			for (i= 0; i<mask_h * OVERSAMPLE; i++) {
> 				/* we use malloc so we can use realloc */
> 				row_spans[i] = (short *)malloc(sizeof(short) * NUM_SPANS);
> 				/* [0] is initialized later */
> 				row_spans[i][1] = NUM_SPANS;
> 			}
404,405c402,403
< 
< 		temp->clear_frame();
---
> 		row_spans_creation.unlock();
> 		
406a405
> 		
407a407,411
> // Draw bezier curves onto span buffer
> 		int old_x, old_y;
> 		int start_x, start_y;
> //struct timeval start_time;
> //gettimeofday(&start_time, 0);
409d412
< // Draw oversampled region of polygons on temp
412,413c415,416
< 			int old_x, old_y;
< 			unsigned char max = k + 1;
---
> 			
> 			old_x = SHRT_MIN; // sentinel
416c419
< 			if(points->total < 3) continue;
---
> 			if(points->total < 2) continue;
417a421,423
> 			for (int i = ptr->row1 * OVERSAMPLE; i < ptr->row2 * OVERSAMPLE; i++) 
> 				row_spans[i][0] = 2; /* mark the span empty: entries start at index 2 */ 
> 			/* init just my rows: ptr->row1 * OVERSAMPLE up to ptr->row2 * OVERSAMPLE */
436,456c442,443
< 				for(int j = 0; j <= segments; j++)
< 				{
< 					float t = (float)j / segments;
< 					float tpow2 = t * t;
< 					float tpow3 = t * t * t;
< 					float invt = 1 - t;
< 					float invtpow2 = invt * invt;
< 					float invtpow3 = invt * invt * invt;
< 
< 					x = (        invtpow3 * x0
< 						+ 3 * t     * invtpow2 * x1
< 						+ 3 * tpow2 * invt     * x2 
< 						+     tpow3            * x3);
< 					y = (        invtpow3 * y0 
< 						+ 3 * t     * invtpow2 * y1
< 						+ 3 * tpow2 * invt     * y2 
< 						+     tpow3            * y3);
< 
< 					y -= ptr->row1;
< 					x *= OVERSAMPLE;
< 					y *= OVERSAMPLE;
---
> 				// possible optimization here... since these coordinates are bounding box for curve
> 				// we can continue with next curve if they are out of our range
458,461c445,446
< 					if(j > 0)
< 					{
< 						draw_line_clamped(temp, old_x, old_y, (int)x, (int)y, max);
< 					}
---
> 				// forward differencing bezier curves implementation taken from GPL code at
> 				// http://cvs.sourceforge.net/viewcvs.py/guliverkli/guliverkli/src/subtitles/Rasterizer.cpp?rev=1.3
463,466d447
< 					old_x = (int)x;
< 					old_y = (int)y;
< 				}
< 			}
468d448
< //printf("MaskUnit::process_package 1\n");
469a450
> 				float cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
471a453,456
> 				// [-1 +3 -3 +1]
> 				// [+3 -6 +3  0]
> 				// [-3 +3  0  0]
> 				// [+1  0  0  0]
472a458,461
> 		 		cx3 = -  x0 + 3*x1 - 3*x2 + x3;
> 				cx2 =  3*x0 - 6*x1 + 3*x2;
> 				cx1 = -3*x0 + 3*x1;
> 				cx0 =    x0;
474,503c463,466
< #define FILL_ROWS(type) \
< for(int i = 0; i < oversampled_package_h; i++) \
< { \
< 	type *row = (type*)temp->get_rows()[i]; \
< 	int value = 0x0; \
< 	int total = 0; \
<  \
<  	for(int j = 0; j < oversampled_package_w; j++) \
< 		if(row[j] == max) total++; \
<  \
<  	if(total > 1) \
< 	{ \
< 		if(total & 0x1) total--; \
< 		for(int j = 0; j < oversampled_package_w; j++) \
< 		{ \
< 			if(row[j] == max && total > 0) \
< 			{ \
< 				if(value)  \
< 					value = 0x0; \
< 				else \
< 					value = max; \
< 				total--; \
< 			} \
< 			else \
< 			{ \
< 				if(value) row[j] = value; \
< 			} \
< 		} \
< 	} \
< }
---
> 				cy3 = -  y0 + 3*y1 - 3*y2 + y3;
> 				cy2 =  3*y0 - 6*y1 + 3*y2;
> 				cy1 = -3*y0 + 3*y1;
> 				cy0 =    y0;
504a468,469
> 				float maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
> 				float maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
506,511c471,472
< // Fill in the polygon in the horizontal direction
< 			switch(temp->get_color_model())
< 			{
< 				case BC_A8:
< 					FILL_ROWS(unsigned char);
< 					break;
---
> 				float maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
> 				float h = 1.0;
513,517c474
< 				case BC_A16:
< 					FILL_ROWS(uint16_t);
< 					break;
< 			}
< 		}
---
> 				if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
518a476,479
> 				for(float t = 0.0; t < 1.0; t += h)
> 				{
> 					x = (cx0 + t*(cx1 + t*(cx2 + t*cx3))) * OVERSAMPLE;
> 					y = (cy0 + t*(cy1 + t*(cy2 + t*cy3)) - ptr->row1) * OVERSAMPLE;
519a481,490
> 					if (old_x != SHRT_MIN) 
> 						draw_line_clamped(old_x, old_y, (int)x, (int)y, oversampled_package_w, oversampled_package_h, ptr->row1 * OVERSAMPLE);
> 					else 
> 					{
> 						start_x = (int) x;
> 						start_y = (int) y;
> 					}
> 					old_x = (int)x;
> 					old_y = (int)y;
> 				}
520a492,500
> 				x = x3 * OVERSAMPLE;
> 				y = (y3 - ptr->row1) * OVERSAMPLE;
> 				draw_line_clamped(old_x, old_y, (int)x, (int)y, oversampled_package_w, oversampled_package_h, ptr->row1 * OVERSAMPLE);
> 				old_x = (int)x;
> 				old_y = (int)y;
> 		
> 			}
> //printf("MaskUnit::process_package 1\n");
> 			draw_line_clamped(old_x, old_y, start_x, start_y, oversampled_package_w, oversampled_package_h, ptr->row1 * OVERSAMPLE);
522a503,509
> 			// Now we have ordered spans ready!
> 			//printf("Segment : %i , row1: %i\n", oversampled_package_h, ptr->row1);
> 			uint16_t value;
> 			if (mask_color_model == BC_A8)
> 				value = (int)((float)engine->value / 100 * 0xff);
> 			else
> 				value = (int)((float)engine->value / 100 * 0xffff);
523a511,545
> 			/* Scanline sampling, inspired by Graphics Gems I, page 81 */
> 			for (int i = ptr->row1; i < ptr->row2; i++) 
> 			{
> 				short min_x = SHRT_MAX;
> 				short max_x = SHRT_MIN;
> 				int j; 				/* universal counter for 0..OVERSAMPLE-1 */
> 				short *span;			/* current span - set inside loops with j */
> 				short span_p[OVERSAMPLE];	/* pointers to current positions in spans */
> 				#define P (span_p[j])		/* current span pointer */
> 				#define MAXP (span[0])		/* current span length */
> 				int num_empty_spans = 0;
> 				/* get the initial span pointers ready */
> 				for (j = 0; j < OVERSAMPLE; j++)
> 				{	
> 					span = row_spans[j + i * OVERSAMPLE];
> 					P = 2;              /* starting pointers to spans */
> 						/* hypothetical fix goes here: take care that there is at most one empty span for every subpixel */ 
> 					if (MAXP != 2) {                                        /* if span is not empty */
> 						if (span[2] < min_x) min_x = span[2];           /* take start of the first span */
> 						if (span[MAXP-1] > max_x) max_x = span[MAXP-1]; /* and end of last */
> 					} else              
> 					{	/* span is empty */
> 						num_empty_spans ++;	
> 					}	
> 				}
> 				if (num_empty_spans == OVERSAMPLE)
> 					continue; /* no work for us here */
> 			
> 				/* we have some pixels to fill, do coverage calculation for span */
> 
> 				void *output_row = (unsigned char*)mask->get_rows()[i];
> 				min_x = min_x / OVERSAMPLE;
> 				max_x = (max_x + OVERSAMPLE - 1) / OVERSAMPLE;
> 				
> 				/* printf("row %i, pixel range: %i %i, spans0: %i\n", i, min_x, max_x, row_spans[i*OVERSAMPLE][0]-2); */
524a547,643
> 				/* this is not a full loop, since we jump through h if possible */
> 				for (int h = min_x; h <= max_x; h++) 
> 				{
> 					short pixelleft = h * OVERSAMPLE;  /* leftmost subpixel of pixel*/
> 					short pixelright = pixelleft + OVERSAMPLE - 1; /* rightmost subpixel of pixel */
> 					uint32_t coverage = 0;
> 					int num_left = 0;               /* number of spans that have start left of the next pixel */
> 					short right_end = SHRT_MAX;     /* leftmost end of any span - right end of a full scanline */
> 					short right_start = SHRT_MAX;   /* leftmost start of any span - left end of empty scanline */
> 
> 					for (j=0; j< OVERSAMPLE; j++) 
> 					{	
> 						char chg = 1;
> 						span = row_spans[j + i * OVERSAMPLE];
> 						while (P < MAXP && chg)
> 						{
> 						//	printf("Sp: %i %i\n", span[P], span[P+1]);
> 							if (span[P] <= pixelright)          /* if span start is before the end of pixel */
> 								coverage += MIN(span[P+1], pixelright)  /* 'clip' the span to pixel */
> 		                                                          - MAX(span[P], pixelleft) + 1;
> 							if (span[P+1] <= pixelright) 
> 								P += 2;
> 							else 
> 								chg = 0;
> 						} 
> 						if (P == MAXP) 
> 							num_left = -OVERSAMPLE; /* just take care that num_left cannot equal OVERSAMPLE or zero again */
> 						else	
> 						{ 
> 							if (span[P] <= pixelright)  /* if span starts before subpixel in the pixel on the right */
> 							{    /* useful for determining filled space till next non-fully-filled pixel */
> 								num_left ++;						
> 								if (span[P+1] < right_end) right_end = span[P+1]; 
> 							} else 
> 							{    /* useful for determining empty space till next non-empty pixel */
> 								if (span[P] < right_start) right_start = span[P]; 
> 							}
> 						}
> 					}
> 					// calculate coverage
> 					coverage *= value;
> 					/* coverage counted covered subpixels, and one pixel holds */
> 					/* OVERSAMPLE * OVERSAMPLE of them, so that is the divisor. */
> 					/* For a power-of-two OVERSAMPLE this unsigned division equals */
> 					/* a right shift by 2 * log2(OVERSAMPLE): 6 for 8, 4 for 4, */
> 					/* 2 for 2. */
> 					coverage /= OVERSAMPLE * OVERSAMPLE;
> 
> 					
> 					if (mask_color_model == BC_A8) 
> 					{
> 						if (((unsigned char *) output_row)[h] < coverage) /* when we have multiple masks... we don't want aliasing inside areas */
> 							((unsigned char*)output_row)[h] = coverage;
> 					} else
> 					{
> 						if (((uint16_t *) output_row)[h] < coverage) /* when we have multiple masks... we don't want aliasing inside areas */
> 							((uint16_t *) output_row)[h] = coverage;
> 					}
> 					/* possible optimization: do joining of multiple masks by span logics, not by bitmap logics*/
> 					
> 					if (num_left == OVERSAMPLE) 
> 					{
> 						/* all current spans start more left than next pixel */
> 						/* this means we can probably (if lucky) draw a longer horizontal line */
> 						right_end = (right_end / OVERSAMPLE) - 1; /* last fully covered pixel */
> 						if (right_end > h)
> 						{
> 							if (mask_color_model == BC_A8) 
> 								memset((char *)output_row + h + 1, value, right_end - h);
> 							else {
> 								/* no 16-bit memset: fill pixels h + 1 .. right_end inclusive, same range as the memset */
> 								for (int z = h +1; z <= right_end; z++)
> 									((uint16_t *) output_row)[z] =  value;
> 		
> 							}
> 							h = right_end;  /* the loop's h++ resumes right after the filled run */
> 						}
> 					} else 
> 					if (num_left == 0) 
> 					{
> 						/* all current spans start right of next pixel */ 
> 						/* this means we can probably (if lucky) skip some pixels */
> 						right_start = (right_start / OVERSAMPLE) - 1; /* last fully empty pixel */
> 						if (right_start > h)
> 						{
> 							h = right_start;
> 						}
> 					}
> 				}
> 				
> 			}					
> 			
> 		}
> //		int64_t dif= get_difference(&start_time);
> //		printf("diff: %lli\n", dif);
> 	}
> 	/* possible optimization: get miny and maxy of the new bitmap - and apply following filters there only */
526,561d644
< #define DOWNSAMPLE(type, value) \
< for(int i = 0; i < ptr->row2 - ptr->row1; i++) \
< { \
< 	type *output_row = (type*)mask->get_rows()[i + ptr->row1]; \
< 	unsigned char **input_rows = (unsigned char**)temp->get_rows() + i * OVERSAMPLE; \
<  \
<  \
< 	for(int j = 0; j < mask_w; j++) \
< 	{ \
< 		int64_t total = 0; \
<  \
< /* Accumulate pixel */ \
< 		for(int k = 0; k < OVERSAMPLE; k++) \
< 		{ \
< 			unsigned char *input_vector = input_rows[k] + j * OVERSAMPLE; \
< 			for(int l = 0; l < OVERSAMPLE; l++) \
< 			{ \
< 				total += (input_vector[l] ? value : 0); \
< 			} \
< 		} \
<  \
< /* Divide pixel */ \
< 		if(OVERSAMPLE == 8) \
< 			total >>= 6; \
< 		else \
< 		if(OVERSAMPLE == 4) \
< 			total >>= 2; \
< 		else \
< 		if(OVERSAMPLE == 2) \
< 			total >>= 2; \
< 		else \
< 			total /= OVERSAMPLE * OVERSAMPLE; \
<  \
< 		output_row[j] = total; \
< 	} \
< }
563a647,648
> 
> //
565c650
< 		switch(mask->get_color_model())
---
> /*		switch(mask->get_color_model())
586c671
< 
---
> */
633a719,721
> 	int chroma_offset = (max + 1) / 2; \
> 	for(int i = ptr->row1; i < ptr->row2; i++) \
> 	{ \
636d723
< 	int chroma_offset = (max + 1) / 2; \
657a745
> 	} \
661a750,752
> 	int chroma_offset = (max + 1) / 2; \
> 		for(int i = ptr->row1; i < ptr->row2; i++) \
> 		{ \
664d754
< 	int chroma_offset = (max + 1) / 2; \
666c756,757
< 	for(int j  = 0; j < mask_w; j++) \
---
>         if (components == 4) output_row += 3; \
> 	for(int j  = mask_w; j != 0;  j--) \
670c761
< 			output_row[j * 4 + 3] = output_row[j * 4 + 3] * mask_row[j] / max; \
---
> 			*output_row = *output_row * *mask_row / max; \
674c765
< 			output_row[j * 3] = output_row[j * 3] * mask_row[j] / max; \
---
> 			output_row[0] = output_row[0] * mask_row[0] / max; /* scale this pixel's own first component */ \
676,677c767,768
< 			output_row[j * 3 + 1] = output_row[j * 3 + 1] * mask_row[j] / max; \
< 			output_row[j * 3 + 2] = output_row[j * 3 + 2] * mask_row[j] / max; \
---
> 			output_row[1] = output_row[1] * mask_row[0] / max; \
> 			output_row[2] = output_row[2] * mask_row[0] / max; \
681,682c772,773
< 				output_row[j * 3 + 1] += chroma_offset * (max - mask_row[j]) / max; \
< 				output_row[j * 3 + 2] += chroma_offset * (max - mask_row[j]) / max; \
---
> 				output_row[1] += chroma_offset * (max - mask_row[0]) / max; \
> 				output_row[2] += chroma_offset * (max - mask_row[0]) / max; \
684a776,778
> 		output_row += components; \
> 		mask_row += 1;		 \
> 	} \
690d783
< 
692c785
< 		for(int i = ptr->row1; i < ptr->row2; i++)
---
> 	//	for(int i = ptr->row1; i < ptr->row2; i++)
748a842
> //printf("diff2: %lli\n", get_difference(&start_time));
758c852
<  : LoadServer(cpus, cpus * OVERSAMPLE * 2)
---
>  : LoadServer(cpus, cpus * 2)
11a12,14
> #define OVERSAMPLE 8
> #define NUM_SPANS 4 /* starting number of spans to be allocated for */
> 
40c43
< 	void draw_line_clamped(VFrame *frame, int &x1, int &y1, int x2, int y2, unsigned char value);
---
> 	void draw_line_clamped(int x1, int y1, int x2, int y2, int w, int h, int hoffset);
57c60,63
< 	VFrame *temp;
---
> 	short **row_spans;
> 	short row_spans_h;
> 	Mutex row_spans_creation;
> 
