diff --git a/ANNOUNCE b/ANNOUNCE index 47842b400..d11f32ad5 100644 --- a/ANNOUNCE +++ b/ANNOUNCE @@ -76,8 +76,9 @@ Version 1.6.31beta02 [July 8, 2017] Version 1.6.31beta03 [July 9, 2017] Updated CMakeLists.txt to add INTEL_SSE and MIPS_MSA platforms. - Change "int" to "png_uint_32" in intel/filter_sse2.c to prevent + Changed "int" to "png_size_t" in intel/filter_sse2.c to prevent possible integer overflow (Bug report by John Bowler). + Quieted "declaration after statement" warnings in intel/filter_sse2.c. Send comments/corrections/commendations to png-mng-implement at lists.sf.net (subscription required; visit diff --git a/CHANGES b/CHANGES index e7834f9de..5670e1d36 100644 --- a/CHANGES +++ b/CHANGES @@ -5871,8 +5871,9 @@ Version 1.6.31beta02 [July 8, 2017] Version 1.6.31beta03 [July 9, 2017] Updated CMakeLists.txt to add INTEL_SSE and MIPS_MSA platforms. - Change "int" to "png_uint_32" in intel/filter_sse2.c to prevent + Changed "int" to "png_size_t" in intel/filter_sse2.c to prevent possible integer overflow (Bug report by John Bowler). + Quieted "declaration after statement" warnings in intel/filter_sse2.c. Send comments/corrections/commendations to png-mng-implement at lists.sf.net (subscription required; visit diff --git a/intel/filter_sse2_intrinsics.c b/intel/filter_sse2_intrinsics.c index 86c4ea431..4f12d0a88 100644 --- a/intel/filter_sse2_intrinsics.c +++ b/intel/filter_sse2_intrinsics.c @@ -53,12 +53,15 @@ static void store3(void* p, __m128i v) { * its bottom two bytes, then its third byte. */ png_uint_32 v012; + png_uint_16* p01; + png_byte* p2; + store4(&v012, v); - png_uint_16* p01 = p; - png_byte* p2 = (png_byte*)(p01+1); - *p01 = v012; - *p2 = v012 >> 16; + p01 = p; + p2 = (png_byte*)(p01+1); + *p01 = (png_uint_16)v012; + *p2 = (png_byte)(v012 >> 16); } void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, @@ -68,10 +71,13 @@ void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, * There is no pixel to the left of the first pixel. It's encoded directly. * That works with our main loop if we just say that left pixel was zero. */ - png_debug(1, "in png_read_filter_row_sub3_sse2"); + png_size_t rb; + __m128i a, d = _mm_setzero_si128(); - png_uint_32 rb = row_info->rowbytes; + png_debug(1, "in png_read_filter_row_sub3_sse2"); + + rb = row_info->rowbytes; while (rb >= 4) { a = d; d = load4(row); d = _mm_add_epi8(d, a); @@ -88,6 +94,7 @@ void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, row += 3; rb -= 3; } + PNG_UNUSED(prev) } void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, @@ -97,10 +104,13 @@ void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, * There is no pixel to the left of the first pixel. It's encoded directly. * That works with our main loop if we just say that left pixel was zero. */ - png_debug(1, "in png_read_filter_row_sub4_sse2"); + png_size_t rb; + __m128i a, d = _mm_setzero_si128(); - png_uint_32 rb = row_info->rowbytes+4; + png_debug(1, "in png_read_filter_row_sub4_sse2"); + + rb = row_info->rowbytes+4; while (rb > 4) { a = d; d = load4(row); d = _mm_add_epi8(d, a); @@ -109,6 +119,7 @@ void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, row += 4; rb -= 4; } + PNG_UNUSED(prev) } void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, @@ -119,18 +130,23 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, * predicted to be half of the pixel above it. So again, this works * perfectly with our loop if we make sure a starts at zero. */ - png_debug(1, "in png_read_filter_row_avg3_sse2"); + + png_size_t rb; + const __m128i zero = _mm_setzero_si128(); + __m128i b; __m128i a, d = zero; - png_uint_32 rb = row_info->rowbytes; + png_debug(1, "in png_read_filter_row_avg3_sse2"); + rb = row_info->rowbytes; while (rb >= 4) { + __m128i avg; b = load4(prev); a = d; d = load4(row ); /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ - __m128i avg = _mm_avg_epu8(a,b); + avg = _mm_avg_epu8(a,b); /* ...but we can fix it up by subtracting off 1 if it rounded up. */ avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1))); @@ -142,11 +158,12 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, rb -= 3; } if (rb > 0) { + __m128i avg; b = load3(prev); a = d; d = load3(row ); /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ - __m128i avg = _mm_avg_epu8(a,b); + avg = _mm_avg_epu8(a,b); /* ...but we can fix it up by subtracting off 1 if it rounded up. */ avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1))); @@ -168,18 +185,21 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, * predicted to be half of the pixel above it. So again, this works * perfectly with our loop if we make sure a starts at zero. */ - png_debug(1, "in png_read_filter_row_avg4_sse2"); + png_size_t rb; const __m128i zero = _mm_setzero_si128(); __m128i b; __m128i a, d = zero; - png_uint_32 rb = row_info->rowbytes+4; + png_debug(1, "in png_read_filter_row_avg4_sse2"); + + rb = row_info->rowbytes+4; while (rb > 4) { + __m128i avg; b = load4(prev); a = d; d = load4(row ); /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ - __m128i avg = _mm_avg_epu8(a,b); + avg = _mm_avg_epu8(a,b); /* ...but we can fix it up by subtracting off 1 if it rounded up. */ avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1))); @@ -237,38 +257,42 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, * Here we zero b and d, which become c and a respectively at the start of * the loop. */ - png_debug(1, "in png_read_filter_row_paeth3_sse2"); + png_size_t rb; const __m128i zero = _mm_setzero_si128(); __m128i c, b = zero, a, d = zero; - png_uint_32 rb = row_info->rowbytes; + png_debug(1, "in png_read_filter_row_paeth3_sse2"); + + rb = row_info->rowbytes; while (rb >= 4) { /* It's easiest to do this math (particularly, deal with pc) with 16-bit * intermediates. */ + __m128i pa,pb,pc,smallest,nearest; c = b; b = _mm_unpacklo_epi8(load4(prev), zero); a = d; d = _mm_unpacklo_epi8(load4(row ), zero); /* (p-a) == (a+b-c - a) == (b-c) */ - __m128i pa = _mm_sub_epi16(b,c); + + pa = _mm_sub_epi16(b,c); /* (p-b) == (a+b-c - b) == (a-c) */ - __m128i pb = _mm_sub_epi16(a,c); + pb = _mm_sub_epi16(a,c); /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ - __m128i pc = _mm_add_epi16(pa,pb); + pc = _mm_add_epi16(pa,pb); pa = abs_i16(pa); /* |p-a| */ pb = abs_i16(pb); /* |p-b| */ pc = abs_i16(pc); /* |p-c| */ - __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); /* Paeth breaks ties favoring a over b over c. */ - __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, - if_then_else(_mm_cmpeq_epi16(smallest, pb), b, - c)); + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + if_then_else(_mm_cmpeq_epi16(smallest, pb), b, + c)); /* Note `_epi8`: we need addition to wrap modulo 255. */ d = _mm_add_epi8(d, nearest); @@ -282,26 +306,27 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, /* It's easiest to do this math (particularly, deal with pc) with 16-bit * intermediates. */ + __m128i pa,pb,pc,smallest,nearest; c = b; b = _mm_unpacklo_epi8(load3(prev), zero); a = d; d = _mm_unpacklo_epi8(load3(row ), zero); /* (p-a) == (a+b-c - a) == (b-c) */ - __m128i pa = _mm_sub_epi16(b,c); + pa = _mm_sub_epi16(b,c); /* (p-b) == (a+b-c - b) == (a-c) */ - __m128i pb = _mm_sub_epi16(a,c); + pb = _mm_sub_epi16(a,c); /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ - __m128i pc = _mm_add_epi16(pa,pb); + pc = _mm_add_epi16(pa,pb); pa = abs_i16(pa); /* |p-a| */ pb = abs_i16(pb); /* |p-b| */ pc = abs_i16(pc); /* |p-c| */ - __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); /* Paeth breaks ties favoring a over b over c. */ - __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c)); @@ -331,12 +356,15 @@ void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, * Here we zero b and d, which become c and a respectively at the start of * the loop. */ - png_debug(1, "in png_read_filter_row_paeth4_sse2"); + png_size_t rb; const __m128i zero = _mm_setzero_si128(); + __m128i pa,pb,pc,smallest,nearest; __m128i c, b = zero, a, d = zero; - png_uint_32 rb = row_info->rowbytes+4; + png_debug(1, "in png_read_filter_row_paeth4_sse2"); + + rb = row_info->rowbytes+4; while (rb > 4) { /* It's easiest to do this math (particularly, deal with pc) with 16-bit * intermediates. @@ -345,22 +373,22 @@ void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, a = d; d = _mm_unpacklo_epi8(load4(row ), zero); /* (p-a) == (a+b-c - a) == (b-c) */ - __m128i pa = _mm_sub_epi16(b,c); + pa = _mm_sub_epi16(b,c); /* (p-b) == (a+b-c - b) == (a-c) */ - __m128i pb = _mm_sub_epi16(a,c); + pb = _mm_sub_epi16(a,c); /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ - __m128i pc = _mm_add_epi16(pa,pb); + pc = _mm_add_epi16(pa,pb); pa = abs_i16(pa); /* |p-a| */ pb = abs_i16(pb); /* |p-b| */ pc = abs_i16(pc); /* |p-c| */ - __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); /* Paeth breaks ties favoring a over b over c. */ - __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));