glibc/ports/sysdeps/tile/tilegx/memcpy.c

207 lines
5.8 KiB
C

/* Copyright (C) 2011-2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <arch/chip.h>
/* Word type used for the bulk of the copy.  Must be 8 bytes in size:
   the unrolled cache-line copy in __memcpy subtracts a literal 8 from
   the remaining byte count for every word it moves.  */
#define word_t uint64_t
/* How many cache lines ahead should we prefetch?  For copies of 16
   bytes or more, __memcpy issues this many source prefetches up front
   before it starts copying.  */
#define PREFETCH_LINES_AHEAD 3
/* Copy N bytes from SRCV to DSTV and return DSTV.

   Copies of fewer than 16 bytes are done one byte at a time.  Larger
   copies first byte-copy until the destination is word-aligned and
   then move whole words:

   - if the source is still misaligned at that point, each destination
     word is assembled from two consecutive aligned source words with
     __insn_dblalign;
   - otherwise both pointers are word-aligned and, when at least one L2
     cache line remains, whole cache lines are copied with __insn_wh64
     on the destination and prefetching on the source.

   Any trailing 1..7 bytes are stored as 4-, 2- and 1-byte pieces taken
   from a final source word.

   As with any memcpy, the regions must not overlap.  An exact
   self-copy is detected and tolerated on the cache-line path only
   because compilers may emit memcpy for aggregate copies where
   src == dst.  */
void *
__memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
{
  char *__restrict dst1 = (char *) dstv;
  const char *__restrict src1 = (const char *) srcv;
  const char *__restrict src1_end;
  const char *__restrict prefetch;
  word_t *__restrict dst8;  /* 8-byte pointer to destination memory.  */
  word_t final;  /* Final bytes to write to trailing word, if any.  */
  long i;

  /* Short copies are not worth the setup below; go byte by byte.  */
  if (n < 16)
    {
      for (; n; n--)
        *dst1++ = *src1++;
      return dstv;
    }

  /* Locate the end of source memory we will copy (address of the last
     source byte).  Don't prefetch past this.  */
  src1_end = src1 + n - 1;

  /* Prefetch ahead a few cache lines, but not past the end.  */
  prefetch = src1;
  for (i = 0; i < PREFETCH_LINES_AHEAD; i++)
    {
      __insn_prefetch (prefetch);
      prefetch += CHIP_L2_LINE_SIZE ();
      /* Keep the advanced pointer only while it still lies inside the
         source; once it would run off the end, fall back to the start
         of the source instead.  */
      prefetch = (prefetch < src1_end) ? prefetch : src1;
    }

  /* Copy bytes until dst is word-aligned.  n >= 16 here and at most
     sizeof (word_t) - 1 bytes are consumed, so n cannot underflow.  */
  for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
    *dst1++ = *src1++;

  /* 8-byte pointer to destination memory.  */
  dst8 = (word_t *) dst1;

  if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
    {
      /* Misaligned copy.  Copy 8 bytes at a time, but don't bother
         with other fanciness.
         TODO: Consider prefetching and using wh64 as well.  */

      /* Create an aligned src8 by rounding src1 down to a word
         boundary.  */
      const word_t *__restrict src8 =
        (const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
      word_t b;

      /* 'a' always holds the previously loaded aligned word, 'b' the
         next one; each output word is formed from the pair.  */
      word_t a = *src8++;
      for (; n >= sizeof (word_t); n -= sizeof (word_t))
        {
          b = *src8++;
          a = __insn_dblalign (a, b, src1);
          *dst8++ = a;
          a = b;
        }

      if (n == 0)
        return dstv;

      /* Load one more source word only if it still contains source
         bytes; otherwise everything that is left is already in 'a'.  */
      b = ((const char *) src8 <= src1_end) ? *src8 : 0;

      /* Final source bytes to write to trailing partial word, if any.  */
      final = __insn_dblalign (a, b, src1);
    }
  else
    {
      /* Aligned copy.  */
      const word_t *__restrict src8 = (const word_t *) src1;

      /* src8 and dst8 are both word-aligned.  */
      if (n >= CHIP_L2_LINE_SIZE ())
        {
          /* Copy until 'dst' is cache-line-aligned.  */
          for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
               n -= sizeof (word_t))
            *dst8++ = *src8++;

          /* If copying to self, return.  The test is cheap enough
             that we do it despite the fact that the memcpy() contract
             doesn't require us to support overlapping dst and src.
             This is the most common case of overlap, and any close
             overlap will cause corruption due to the wh64 below.
             This case is particularly important since the compiler
             will emit memcpy() calls for aggregate copies even if it
             can't prove that src != dst.  */
          if (__builtin_expect (dst8 == src8, 0))
            return dstv;

          for (; n >= CHIP_L2_LINE_SIZE ();)
            {
              __insn_wh64 (dst8);

              /* Prefetch and advance to next line to prefetch, but
                 don't go past the end: when the advanced pointer is no
                 longer inside the source, restart from the line
                 currently being copied.  */
              __insn_prefetch (prefetch);
              prefetch += CHIP_L2_LINE_SIZE ();
              prefetch = (prefetch < src1_end) ? prefetch :
                (const char *) src8;

              /* Copy an entire cache line.  Manually unrolled to
                 avoid idiosyncrasies of compiler unrolling.  */
#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
              COPY_WORD (0);
              COPY_WORD (1);
              COPY_WORD (2);
              COPY_WORD (3);
              COPY_WORD (4);
              COPY_WORD (5);
              COPY_WORD (6);
              COPY_WORD (7);
              /* The eight COPY_WORDs above cover exactly 64 bytes.  */
#if CHIP_L2_LINE_SIZE() != 64
# error "Fix code that assumes particular L2 cache line size."
#endif

              dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
              src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
            }
        }

      /* Copy the remaining whole words.  */
      for (; n >= sizeof (word_t); n -= sizeof (word_t))
        *dst8++ = *src8++;

      if (__builtin_expect (n == 0, 1))
        return dstv;

      /* 1..7 bytes remain; load the word that contains them.  */
      final = *src8;
    }

  /* n != 0 if we get here.  Write out any trailing bytes.  */
  dst1 = (char *) dst8;
#ifndef __BIG_ENDIAN__
  /* Little-endian: the next bytes to store are the low-order bytes of
     'final', so shift the consumed bytes out after each store.  */
  if (n & 4)
    {
      *(uint32_t *) dst1 = final;
      dst1 += 4;
      final >>= 32;
      n &= 3;
    }
  if (n & 2)
    {
      *(uint16_t *) dst1 = final;
      dst1 += 2;
      final >>= 16;
      n &= 1;
    }
  if (n)
    *(uint8_t *) dst1 = final;
#else
  /* Big-endian: the next bytes to store are the high-order bytes of
     'final'.  When a piece is stored, the following piece is already
     at the right position for the next (narrower) shift; when a piece
     is skipped, shift the wanted bytes down into its place.  */
  if (n & 4)
    {
      *(uint32_t *) dst1 = final >> 32;
      dst1 += 4;
    }
  else
    {
      final >>= 32;
    }
  if (n & 2)
    {
      *(uint16_t *) dst1 = final >> 16;
      dst1 += 2;
    }
  else
    {
      final >>= 16;
    }
  if (n & 1)
    *(uint8_t *) dst1 = final >> 8;
#endif

  return dstv;
}
/* Publish __memcpy under the public name memcpy.  weak_alias and
   libc_hidden_builtin_def are glibc-internal macros defined outside
   this file; presumably the former makes memcpy a weak alias of
   __memcpy and the latter emits the hidden definition used for calls
   from inside libc -- confirm against their definitions.  */
weak_alias (__memcpy, memcpy)
libc_hidden_builtin_def (memcpy)