From 17766404d510cf01a0c88db6a5f0eebbc009c133 Mon Sep 17 00:00:00 2001
From: "plind44@gmail.com"
Date: Fri, 6 Dec 2013 16:23:49 +0000
Subject: [PATCH] MIPS: Faster memcpy.

BUG=
R=jkummerow@chromium.org, plind44@gmail.com

Review URL: https://codereview.chromium.org/104353002

Patch from yuyin QQ .

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18274 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/mips/assembler-mips.cc       |  15 ++
 src/mips/assembler-mips.h        |  13 +
 src/mips/codegen-mips.cc         | 424 +++++++++++++++++++++++++++++++
 src/mips/constants-mips.h        |  12 +
 src/mips/disasm-mips.cc          |   3 +
 src/mips/macro-assembler-mips.cc |  21 ++
 src/mips/macro-assembler-mips.h  |   5 +
 src/platform-posix.cc            |   9 +
 src/platform.h                   |  20 ++
 src/v8utils.h                    |  21 ++
 10 files changed, 543 insertions(+)

diff --git a/src/mips/assembler-mips.cc b/src/mips/assembler-mips.cc
index 9aed3bd4aa..f551dd5e10 100644
--- a/src/mips/assembler-mips.cc
+++ b/src/mips/assembler-mips.cc
@@ -260,6 +260,12 @@ MemOperand::MemOperand(Register rm, int32_t offset) : Operand(rm) {
 }
 
 
+MemOperand::MemOperand(Register rm, int32_t unit, int32_t multiplier,
+                       OffsetAddend offset_addend) : Operand(rm) {
+  offset_ = unit * multiplier + offset_addend;
+}
+
+
 // -----------------------------------------------------------------------------
 // Specific instructions, constants, and masks.
 
@@ -1623,6 +1629,15 @@ void Assembler::ext_(Register rt, Register rs, uint16_t pos, uint16_t size) {
 }
 
+
+void Assembler::pref(int32_t hint, const MemOperand& rs) {
+  ASSERT(kArchVariant != kLoongson);
+  ASSERT(is_uint5(hint) && rs.OffsetIsInt16Encodable());  // Signed imm16.
+  Instr instr = PREF | (rs.rm().code() << kRsShift) | (hint << kRtShift)
+      | (rs.offset_ & 0xffff);  // Mask so a negative offset stays in imm16.
+  emit(instr);
+}
+
+
 //--------Coprocessor-instructions----------------
 
 // Load, store, move.
diff --git a/src/mips/assembler-mips.h b/src/mips/assembler-mips.h
index d9ef46cd01..70f77eaeda 100644
--- a/src/mips/assembler-mips.h
+++ b/src/mips/assembler-mips.h
@@ -386,7 +386,15 @@ class Operand BASE_EMBEDDED {
 // Class MemOperand represents a memory operand in load and store instructions.
 class MemOperand : public Operand {
  public:
+  // Immediate value attached to offset.
+  enum OffsetAddend {
+    offset_minus_one = -1,
+    offset_zero = 0
+  };
+
   explicit MemOperand(Register rn, int32_t offset = 0);
+  explicit MemOperand(Register rn, int32_t unit, int32_t multiplier,
+                      OffsetAddend offset_addend = offset_zero);
   int32_t offset() const { return offset_; }
 
   bool OffsetIsInt16Encodable() const {
@@ -711,6 +719,11 @@ class Assembler : public AssemblerBase {
   void swr(Register rd, const MemOperand& rs);
 
 
+  //----------------Prefetch--------------------
+
+  void pref(int32_t hint, const MemOperand& rs);
+
+
   //-------------Misc-instructions--------------
 
   // Break / Trap instructions.
diff --git a/src/mips/codegen-mips.cc b/src/mips/codegen-mips.cc
index 2c756580e8..cd3a24fb16 100644
--- a/src/mips/codegen-mips.cc
+++ b/src/mips/codegen-mips.cc
@@ -110,6 +110,430 @@ UnaryMathFunction CreateExpFunction() {
 }
 
+
+#if defined(V8_HOST_ARCH_MIPS)
+OS::MemCopyUint8Function CreateMemCopyUint8Function(
+    OS::MemCopyUint8Function stub) {
+#if defined(USE_SIMULATOR)
+  return stub;
+#else
+  if (Serializer::enabled()) {
+    return stub;
+  }
+
+  size_t actual_size;
+  byte* buffer = static_cast<byte*>(OS::Allocate(3 * KB, &actual_size, true));
+  if (buffer == NULL) return stub;
+
+  // This code assumes that cache lines are 32 bytes and if the cache line is
+  // larger it will not work correctly.
+  MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size));
+
+  {
+    Label lastb, unaligned, aligned, chkw,
+          loop16w, chk1w, wordCopy_loop, skip_pref, lastbloop,
+          leave, ua_chk16w, ua_loop16w, ua_skip_pref, ua_chkw,
+          ua_chk1w, ua_wordCopy_loop, ua_smallCopy, ua_smallCopy_loop;
+
+    // The size of each prefetch.
+    uint32_t pref_chunk = 32;
+    // The maximum size of a prefetch, it must not be less than pref_chunk.
+    // If the real size of a prefetch is greater than max_pref_size and
+    // the kPrefHintPrepareForStore hint is used, the code will not work
+    // correctly.
+    uint32_t max_pref_size = 128;
+    ASSERT(pref_chunk < max_pref_size);
+
+    // pref_limit is set based on the fact that we never use an offset
+    // greater than 5 on a store pref and that a single pref can
+    // never be larger than max_pref_size.
+    uint32_t pref_limit = (5 * pref_chunk) + max_pref_size;
+    int32_t pref_hint_load = kPrefHintLoadStreamed;
+    int32_t pref_hint_store = kPrefHintPrepareForStore;
+    uint32_t loadstore_chunk = 4;
+
+    // The initial prefetches may fetch bytes that are before the buffer being
+    // copied. Start copies with an offset of 4 to avoid this situation when
+    // using kPrefHintPrepareForStore.
+    ASSERT(pref_hint_store != kPrefHintPrepareForStore ||
+           pref_chunk * 4 >= max_pref_size);
+
+    // If the size is less than 8, go to lastb. Regardless of size,
+    // copy dst pointer to v0 for the return value.
+    __ slti(t2, a2, 2 * loadstore_chunk);
+    __ bne(t2, zero_reg, &lastb);
+    __ mov(v0, a0);  // In delay slot.
+
+    // If src and dst have different alignments, go to unaligned, if they
+    // have the same alignment (but are not actually aligned) do a partial
+    // load/store to make them aligned. If they are both already aligned
+    // we can start copying at aligned.
+    __ xor_(t8, a1, a0);
+    __ andi(t8, t8, loadstore_chunk - 1);  // t8 is a0/a1 word-displacement.
+    __ bne(t8, zero_reg, &unaligned);
+    __ subu(a3, zero_reg, a0);  // In delay slot.
+
+    __ andi(a3, a3, loadstore_chunk - 1);  // Copy a3 bytes to align a0/a1.
+    __ beq(a3, zero_reg, &aligned);  // Already aligned.
+    __ subu(a2, a2, a3);  // In delay slot. a2 is the remaining bytes count.
+
+    __ lwr(t8, MemOperand(a1));
+    __ addu(a1, a1, a3);
+    __ swr(t8, MemOperand(a0));
+    __ addu(a0, a0, a3);
+
+    // Now dst/src are both aligned to (word) aligned addresses. Set a2 to
+    // count how many bytes we have to copy after all the 64 byte chunks are
+    // copied and a3 to the dst pointer after all the 64 byte chunks have been
+    // copied. We will loop, incrementing a0 and a1 until a0 equals a3.
+    __ bind(&aligned);
+    __ andi(t8, a2, 0x3f);
+    __ beq(a2, t8, &chkw);  // Less than 64?
+    __ subu(a3, a2, t8);  // In delay slot.
+    __ addu(a3, a0, a3);  // Now a3 is the final dst after loop.
+
+    // When in the loop we prefetch with kPrefHintPrepareForStore hint, in
+    // this case the a0+x should not be past the "t0-32" address. This means:
+    // for x=128 the last "safe" a0 address is "t0-160". Alternatively, for
+    // x=64 the last "safe" a0 address is "t0-96". In the current version we
+    // will use "pref hint, 128(a0)", so "t0-160" is the limit.
+    if (pref_hint_store == kPrefHintPrepareForStore) {
+      __ addu(t0, a0, a2);  // t0 is the "past the end" address.
+      __ Subu(t9, t0, pref_limit);  // t9 is the "last safe pref" address.
+    }
+
+    __ Pref(pref_hint_load, MemOperand(a1, 0 * pref_chunk));
+    __ Pref(pref_hint_load, MemOperand(a1, 1 * pref_chunk));
+    __ Pref(pref_hint_load, MemOperand(a1, 2 * pref_chunk));
+    __ Pref(pref_hint_load, MemOperand(a1, 3 * pref_chunk));
+
+    if (pref_hint_store != kPrefHintPrepareForStore) {
+      __ Pref(pref_hint_store, MemOperand(a0, 1 * pref_chunk));
+      __ Pref(pref_hint_store, MemOperand(a0, 2 * pref_chunk));
+      __ Pref(pref_hint_store, MemOperand(a0, 3 * pref_chunk));
+    }
+    __ bind(&loop16w);
+    __ lw(t0, MemOperand(a1));
+
+    if (pref_hint_store == kPrefHintPrepareForStore) {
+      __ sltu(v1, t9, a0);  // If a0 > t9, don't use next prefetch.
+      __ Branch(USE_DELAY_SLOT, &skip_pref, gt, v1, Operand(zero_reg));
+    }
+    __ lw(t1, MemOperand(a1, 1, loadstore_chunk));  // Maybe in delay slot.
+
+    __ Pref(pref_hint_store, MemOperand(a0, 4 * pref_chunk));
+    __ Pref(pref_hint_store, MemOperand(a0, 5 * pref_chunk));
+
+    __ bind(&skip_pref);
+    __ lw(t2, MemOperand(a1, 2, loadstore_chunk));
+    __ lw(t3, MemOperand(a1, 3, loadstore_chunk));
+    __ lw(t4, MemOperand(a1, 4, loadstore_chunk));
+    __ lw(t5, MemOperand(a1, 5, loadstore_chunk));
+    __ lw(t6, MemOperand(a1, 6, loadstore_chunk));
+    __ lw(t7, MemOperand(a1, 7, loadstore_chunk));
+    __ Pref(pref_hint_load, MemOperand(a1, 4 * pref_chunk));
+
+    __ sw(t0, MemOperand(a0));
+    __ sw(t1, MemOperand(a0, 1, loadstore_chunk));
+    __ sw(t2, MemOperand(a0, 2, loadstore_chunk));
+    __ sw(t3, MemOperand(a0, 3, loadstore_chunk));
+    __ sw(t4, MemOperand(a0, 4, loadstore_chunk));
+    __ sw(t5, MemOperand(a0, 5, loadstore_chunk));
+    __ sw(t6, MemOperand(a0, 6, loadstore_chunk));
+    __ sw(t7, MemOperand(a0, 7, loadstore_chunk));
+
+    __ lw(t0, MemOperand(a1, 8, loadstore_chunk));
+    __ lw(t1, MemOperand(a1, 9, loadstore_chunk));
+    __ lw(t2, MemOperand(a1, 10, loadstore_chunk));
+    __ lw(t3, MemOperand(a1, 11, loadstore_chunk));
+    __ lw(t4, MemOperand(a1, 12, loadstore_chunk));
+    __ lw(t5, MemOperand(a1, 13, loadstore_chunk));
+    __ lw(t6, MemOperand(a1, 14, loadstore_chunk));
+    __ lw(t7, MemOperand(a1, 15, loadstore_chunk));
+    __ Pref(pref_hint_load, MemOperand(a1, 5 * pref_chunk));
+
+    __ sw(t0, MemOperand(a0, 8, loadstore_chunk));
+    __ sw(t1, MemOperand(a0, 9, loadstore_chunk));
+    __ sw(t2, MemOperand(a0, 10, loadstore_chunk));
+    __ sw(t3, MemOperand(a0, 11, loadstore_chunk));
+    __ sw(t4, MemOperand(a0, 12, loadstore_chunk));
+    __ sw(t5, MemOperand(a0, 13, loadstore_chunk));
+    __ sw(t6, MemOperand(a0, 14, loadstore_chunk));
+    __ sw(t7, MemOperand(a0, 15, loadstore_chunk));
+    __ addiu(a0, a0, 16 * loadstore_chunk);
+    __ bne(a0, a3, &loop16w);
+    __ addiu(a1, a1, 16 * loadstore_chunk);  // In delay slot.
+    __ mov(a2, t8);
+
+    // Here we have src and dest word-aligned but less than 64-bytes to go.
+    // Check for a 32 bytes chunk and copy if there is one. Otherwise jump
+    // down to chk1w to handle the tail end of the copy.
+    __ bind(&chkw);
+    __ Pref(pref_hint_load, MemOperand(a1, 0 * pref_chunk));
+    __ andi(t8, a2, 0x1f);
+    __ beq(a2, t8, &chk1w);  // Less than 32?
+    __ nop();  // In delay slot.
+    __ lw(t0, MemOperand(a1));
+    __ lw(t1, MemOperand(a1, 1, loadstore_chunk));
+    __ lw(t2, MemOperand(a1, 2, loadstore_chunk));
+    __ lw(t3, MemOperand(a1, 3, loadstore_chunk));
+    __ lw(t4, MemOperand(a1, 4, loadstore_chunk));
+    __ lw(t5, MemOperand(a1, 5, loadstore_chunk));
+    __ lw(t6, MemOperand(a1, 6, loadstore_chunk));
+    __ lw(t7, MemOperand(a1, 7, loadstore_chunk));
+    __ addiu(a1, a1, 8 * loadstore_chunk);
+    __ sw(t0, MemOperand(a0));
+    __ sw(t1, MemOperand(a0, 1, loadstore_chunk));
+    __ sw(t2, MemOperand(a0, 2, loadstore_chunk));
+    __ sw(t3, MemOperand(a0, 3, loadstore_chunk));
+    __ sw(t4, MemOperand(a0, 4, loadstore_chunk));
+    __ sw(t5, MemOperand(a0, 5, loadstore_chunk));
+    __ sw(t6, MemOperand(a0, 6, loadstore_chunk));
+    __ sw(t7, MemOperand(a0, 7, loadstore_chunk));
+    __ addiu(a0, a0, 8 * loadstore_chunk);
+
+    // Here we have less than 32 bytes to copy. Set up for a loop to copy
+    // one word at a time. Set a2 to count how many bytes we have to copy
+    // after all the word chunks are copied and a3 to the dst pointer after
+    // all the word chunks have been copied. We will loop, incrementing a0
+    // and a1 until a0 equals a3.
+    __ bind(&chk1w);
+    __ andi(a2, t8, loadstore_chunk - 1);
+    __ beq(a2, t8, &lastb);
+    __ subu(a3, t8, a2);  // In delay slot.
+    __ addu(a3, a0, a3);
+
+    __ bind(&wordCopy_loop);
+    __ lw(t3, MemOperand(a1));
+    __ addiu(a0, a0, loadstore_chunk);
+    __ addiu(a1, a1, loadstore_chunk);
+    __ bne(a0, a3, &wordCopy_loop);
+    __ sw(t3, MemOperand(a0, -1, loadstore_chunk));  // In delay slot.
+
+    __ bind(&lastb);
+    __ Branch(&leave, le, a2, Operand(zero_reg));
+    __ addu(a3, a0, a2);
+
+    __ bind(&lastbloop);
+    __ lb(v1, MemOperand(a1));
+    __ addiu(a0, a0, 1);
+    __ addiu(a1, a1, 1);
+    __ bne(a0, a3, &lastbloop);
+    __ sb(v1, MemOperand(a0, -1));  // In delay slot.
+
+    __ bind(&leave);
+    __ jr(ra);
+    __ nop();
+
+    // Unaligned case. Only the dst gets aligned so we need to do partial
+    // loads of the source followed by normal stores to the dst (once we
+    // have aligned the destination).
+    __ bind(&unaligned);
+    __ andi(a3, a3, loadstore_chunk - 1);  // Copy a3 bytes to align a0 (dst).
+    __ beq(a3, zero_reg, &ua_chk16w);
+    __ subu(a2, a2, a3);  // In delay slot.
+
+    __ lwr(v1, MemOperand(a1));
+    __ lwl(v1,
+           MemOperand(a1, 1, loadstore_chunk, MemOperand::offset_minus_one));
+    __ addu(a1, a1, a3);
+    __ swr(v1, MemOperand(a0));
+    __ addu(a0, a0, a3);
+
+    // Now the dst (but not the source) is aligned. Set a2 to count how many
+    // bytes we have to copy after all the 64 byte chunks are copied and a3 to
+    // the dst pointer after all the 64 byte chunks have been copied. We will
+    // loop, incrementing a0 and a1 until a0 equals a3.
+    __ bind(&ua_chk16w);
+    __ andi(t8, a2, 0x3f);
+    __ beq(a2, t8, &ua_chkw);
+    __ subu(a3, a2, t8);  // In delay slot.
+    __ addu(a3, a0, a3);
+
+    if (pref_hint_store == kPrefHintPrepareForStore) {
+      __ addu(t0, a0, a2);
+      __ Subu(t9, t0, pref_limit);
+    }
+
+    __ Pref(pref_hint_load, MemOperand(a1, 0 * pref_chunk));
+    __ Pref(pref_hint_load, MemOperand(a1, 1 * pref_chunk));
+    __ Pref(pref_hint_load, MemOperand(a1, 2 * pref_chunk));
+
+    if (pref_hint_store != kPrefHintPrepareForStore) {
+      __ Pref(pref_hint_store, MemOperand(a0, 1 * pref_chunk));
+      __ Pref(pref_hint_store, MemOperand(a0, 2 * pref_chunk));
+      __ Pref(pref_hint_store, MemOperand(a0, 3 * pref_chunk));
+    }
+
+    __ bind(&ua_loop16w);
+    __ Pref(pref_hint_load, MemOperand(a1, 3 * pref_chunk));
+    __ lwr(t0, MemOperand(a1));
+    __ lwr(t1, MemOperand(a1, 1, loadstore_chunk));
+    __ lwr(t2, MemOperand(a1, 2, loadstore_chunk));
+
+    if (pref_hint_store == kPrefHintPrepareForStore) {
+      __ sltu(v1, t9, a0);
+      __ Branch(USE_DELAY_SLOT, &ua_skip_pref, gt, v1, Operand(zero_reg));
+    }
+    __ lwr(t3, MemOperand(a1, 3, loadstore_chunk));  // Maybe in delay slot.
+
+    __ Pref(pref_hint_store, MemOperand(a0, 4 * pref_chunk));
+    __ Pref(pref_hint_store, MemOperand(a0, 5 * pref_chunk));
+
+    __ bind(&ua_skip_pref);
+    __ lwr(t4, MemOperand(a1, 4, loadstore_chunk));
+    __ lwr(t5, MemOperand(a1, 5, loadstore_chunk));
+    __ lwr(t6, MemOperand(a1, 6, loadstore_chunk));
+    __ lwr(t7, MemOperand(a1, 7, loadstore_chunk));
+    __ lwl(t0,
+           MemOperand(a1, 1, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t1,
+           MemOperand(a1, 2, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t2,
+           MemOperand(a1, 3, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t3,
+           MemOperand(a1, 4, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t4,
+           MemOperand(a1, 5, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t5,
+           MemOperand(a1, 6, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t6,
+           MemOperand(a1, 7, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t7,
+           MemOperand(a1, 8, loadstore_chunk, MemOperand::offset_minus_one));
+    __ Pref(pref_hint_load, MemOperand(a1, 4 * pref_chunk));
+    __ sw(t0, MemOperand(a0));
+    __ sw(t1, MemOperand(a0, 1, loadstore_chunk));
+    __ sw(t2, MemOperand(a0, 2, loadstore_chunk));
+    __ sw(t3, MemOperand(a0, 3, loadstore_chunk));
+    __ sw(t4, MemOperand(a0, 4, loadstore_chunk));
+    __ sw(t5, MemOperand(a0, 5, loadstore_chunk));
+    __ sw(t6, MemOperand(a0, 6, loadstore_chunk));
+    __ sw(t7, MemOperand(a0, 7, loadstore_chunk));
+    __ lwr(t0, MemOperand(a1, 8, loadstore_chunk));
+    __ lwr(t1, MemOperand(a1, 9, loadstore_chunk));
+    __ lwr(t2, MemOperand(a1, 10, loadstore_chunk));
+    __ lwr(t3, MemOperand(a1, 11, loadstore_chunk));
+    __ lwr(t4, MemOperand(a1, 12, loadstore_chunk));
+    __ lwr(t5, MemOperand(a1, 13, loadstore_chunk));
+    __ lwr(t6, MemOperand(a1, 14, loadstore_chunk));
+    __ lwr(t7, MemOperand(a1, 15, loadstore_chunk));
+    __ lwl(t0,
+           MemOperand(a1, 9, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t1,
+           MemOperand(a1, 10, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t2,
+           MemOperand(a1, 11, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t3,
+           MemOperand(a1, 12, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t4,
+           MemOperand(a1, 13, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t5,
+           MemOperand(a1, 14, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t6,
+           MemOperand(a1, 15, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t7,
+           MemOperand(a1, 16, loadstore_chunk, MemOperand::offset_minus_one));
+    __ Pref(pref_hint_load, MemOperand(a1, 5 * pref_chunk));
+    __ sw(t0, MemOperand(a0, 8, loadstore_chunk));
+    __ sw(t1, MemOperand(a0, 9, loadstore_chunk));
+    __ sw(t2, MemOperand(a0, 10, loadstore_chunk));
+    __ sw(t3, MemOperand(a0, 11, loadstore_chunk));
+    __ sw(t4, MemOperand(a0, 12, loadstore_chunk));
+    __ sw(t5, MemOperand(a0, 13, loadstore_chunk));
+    __ sw(t6, MemOperand(a0, 14, loadstore_chunk));
+    __ sw(t7, MemOperand(a0, 15, loadstore_chunk));
+    __ addiu(a0, a0, 16 * loadstore_chunk);
+    __ bne(a0, a3, &ua_loop16w);
+    __ addiu(a1, a1, 16 * loadstore_chunk);  // In delay slot.
+    __ mov(a2, t8);
+
+    // Here less than 64-bytes. Check for
+    // a 32 byte chunk and copy if there is one. Otherwise jump down to
+    // ua_chk1w to handle the tail end of the copy.
+    __ bind(&ua_chkw);
+    __ Pref(pref_hint_load, MemOperand(a1));
+    __ andi(t8, a2, 0x1f);
+
+    __ beq(a2, t8, &ua_chk1w);
+    __ nop();  // In delay slot.
+    __ lwr(t0, MemOperand(a1));
+    __ lwr(t1, MemOperand(a1, 1, loadstore_chunk));
+    __ lwr(t2, MemOperand(a1, 2, loadstore_chunk));
+    __ lwr(t3, MemOperand(a1, 3, loadstore_chunk));
+    __ lwr(t4, MemOperand(a1, 4, loadstore_chunk));
+    __ lwr(t5, MemOperand(a1, 5, loadstore_chunk));
+    __ lwr(t6, MemOperand(a1, 6, loadstore_chunk));
+    __ lwr(t7, MemOperand(a1, 7, loadstore_chunk));
+    __ lwl(t0,
+           MemOperand(a1, 1, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t1,
+           MemOperand(a1, 2, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t2,
+           MemOperand(a1, 3, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t3,
+           MemOperand(a1, 4, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t4,
+           MemOperand(a1, 5, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t5,
+           MemOperand(a1, 6, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t6,
+           MemOperand(a1, 7, loadstore_chunk, MemOperand::offset_minus_one));
+    __ lwl(t7,
+           MemOperand(a1, 8, loadstore_chunk, MemOperand::offset_minus_one));
+    __ addiu(a1, a1, 8 * loadstore_chunk);
+    __ sw(t0, MemOperand(a0));
+    __ sw(t1, MemOperand(a0, 1, loadstore_chunk));
+    __ sw(t2, MemOperand(a0, 2, loadstore_chunk));
+    __ sw(t3, MemOperand(a0, 3, loadstore_chunk));
+    __ sw(t4, MemOperand(a0, 4, loadstore_chunk));
+    __ sw(t5, MemOperand(a0, 5, loadstore_chunk));
+    __ sw(t6, MemOperand(a0, 6, loadstore_chunk));
+    __ sw(t7, MemOperand(a0, 7, loadstore_chunk));
+    __ addiu(a0, a0, 8 * loadstore_chunk);
+
+    // Less than 32 bytes to copy. Set up for a loop to
+    // copy one word at a time.
+    __ bind(&ua_chk1w);
+    __ andi(a2, t8, loadstore_chunk - 1);
+    __ beq(a2, t8, &ua_smallCopy);
+    __ subu(a3, t8, a2);  // In delay slot.
+    __ addu(a3, a0, a3);
+
+    __ bind(&ua_wordCopy_loop);
+    __ lwr(v1, MemOperand(a1));
+    __ lwl(v1,
+           MemOperand(a1, 1, loadstore_chunk, MemOperand::offset_minus_one));
+    __ addiu(a0, a0, loadstore_chunk);
+    __ addiu(a1, a1, loadstore_chunk);
+    __ bne(a0, a3, &ua_wordCopy_loop);
+    __ sw(v1, MemOperand(a0, -1, loadstore_chunk));  // In delay slot.
+
+    // Copy the remaining tail bytes (fewer than one word).
+    __ bind(&ua_smallCopy);
+    __ beq(a2, zero_reg, &leave);
+    __ addu(a3, a0, a2);  // In delay slot.
+
+    __ bind(&ua_smallCopy_loop);
+    __ lb(v1, MemOperand(a1));
+    __ addiu(a0, a0, 1);
+    __ addiu(a1, a1, 1);
+    __ bne(a0, a3, &ua_smallCopy_loop);
+    __ sb(v1, MemOperand(a0, -1));  // In delay slot.
+
+    __ jr(ra);
+    __ nop();
+  }
+  CodeDesc desc;
+  masm.GetCode(&desc);
+  ASSERT(!RelocInfo::RequiresRelocation(desc));
+
+  CPU::FlushICache(buffer, actual_size);
+  OS::ProtectCode(buffer, actual_size);
+  return FUNCTION_CAST<OS::MemCopyUint8Function>(buffer);
+#endif  // defined(USE_SIMULATOR)
+}
+#endif  // defined(V8_HOST_ARCH_MIPS)
+
 #undef __
 
diff --git a/src/mips/constants-mips.h b/src/mips/constants-mips.h
index 5a0870fd21..dcf8b82db0 100644
--- a/src/mips/constants-mips.h
+++ b/src/mips/constants-mips.h
@@ -124,6 +124,16 @@ const uint32_t kFCSRFlagMask =
 const uint32_t kFCSRExceptionFlagMask = kFCSRFlagMask ^ kFCSRInexactFlagMask;
 
+// 'pref' instruction hints
+const int32_t kPrefHintLoad = 0;
+const int32_t kPrefHintStore = 1;
+const int32_t kPrefHintLoadStreamed = 4;
+const int32_t kPrefHintStoreStreamed = 5;
+const int32_t kPrefHintLoadRetained = 6;
+const int32_t kPrefHintStoreRetained = 7;
+const int32_t kPrefHintWritebackInvalidate = 25;
+const int32_t kPrefHintPrepareForStore = 30;
+
 // Helper functions for converting between register numbers and names.
class Registers { public: @@ -297,6 +307,8 @@ enum Opcode { LWC1 = ((6 << 3) + 1) << kOpcodeShift, LDC1 = ((6 << 3) + 5) << kOpcodeShift, + PREF = ((6 << 3) + 3) << kOpcodeShift, + SWC1 = ((7 << 3) + 1) << kOpcodeShift, SDC1 = ((7 << 3) + 5) << kOpcodeShift, diff --git a/src/mips/disasm-mips.cc b/src/mips/disasm-mips.cc index 691df940f2..1ae0340351 100644 --- a/src/mips/disasm-mips.cc +++ b/src/mips/disasm-mips.cc @@ -899,6 +899,9 @@ void Decoder::DecodeTypeImmediate(Instruction* instr) { case LWR: Format(instr, "lwr 'rt, 'imm16s('rs)"); break; + case PREF: + Format(instr, "pref 'rt, 'imm16s('rs)"); + break; case SB: Format(instr, "sb 'rt, 'imm16s('rs)"); break; diff --git a/src/mips/macro-assembler-mips.cc b/src/mips/macro-assembler-mips.cc index f33e6fa063..bc981cbaac 100644 --- a/src/mips/macro-assembler-mips.cc +++ b/src/mips/macro-assembler-mips.cc @@ -789,8 +789,29 @@ void MacroAssembler::Ror(Register rd, Register rs, const Operand& rt) { } +void MacroAssembler::Pref(int32_t hint, const MemOperand& rs) { + if (kArchVariant == kLoongson) { + lw(zero_reg, rs); + } else { + pref(hint, rs); + } +} + + //------------Pseudo-instructions------------- +void MacroAssembler::Ulw(Register rd, const MemOperand& rs) { + lwr(rd, rs); + lwl(rd, MemOperand(rs.rm(), rs.offset() + 3)); +} + + +void MacroAssembler::Usw(Register rd, const MemOperand& rs) { + swr(rd, rs); + swl(rd, MemOperand(rs.rm(), rs.offset() + 3)); +} + + void MacroAssembler::li(Register dst, Handle value, LiFlags mode) { AllowDeferredHandleDereference smi_check; if (value->IsSmi()) { diff --git a/src/mips/macro-assembler-mips.h b/src/mips/macro-assembler-mips.h index 4e30c353e2..f1b82ed0e5 100644 --- a/src/mips/macro-assembler-mips.h +++ b/src/mips/macro-assembler-mips.h @@ -601,12 +601,17 @@ class MacroAssembler: public Assembler { #undef DEFINE_INSTRUCTION #undef DEFINE_INSTRUCTION2 + void Pref(int32_t hint, const MemOperand& rs); + // 
--------------------------------------------------------------------------- // Pseudo-instructions. void mov(Register rd, Register rt) { or_(rd, rt, zero_reg); } + void Ulw(Register rd, const MemOperand& rs); + void Usw(Register rd, const MemOperand& rs); + // Load int32 in the rd register. void li(Register rd, Operand j, LiFlags mode = OPTIMIZE_SIZE); inline void li(Register rd, int32_t j, LiFlags mode = OPTIMIZE_SIZE) { diff --git a/src/platform-posix.cc b/src/platform-posix.cc index 40d37c8a53..0070bdf6a7 100644 --- a/src/platform-posix.cc +++ b/src/platform-posix.cc @@ -503,6 +503,12 @@ OS::MemCopyUint8Function CreateMemCopyUint8Function( OS::MemCopyUint8Function stub); OS::MemCopyUint16Uint8Function CreateMemCopyUint16Uint8Function( OS::MemCopyUint16Uint8Function stub); + +#elif defined(V8_HOST_ARCH_MIPS) +OS::MemCopyUint8Function OS::memcopy_uint8_function = &OS::MemCopyUint8Wrapper; +// Defined in codegen-mips.cc. +OS::MemCopyUint8Function CreateMemCopyUint8Function( + OS::MemCopyUint8Function stub); #endif @@ -517,6 +523,9 @@ void OS::PostSetUp() { CreateMemCopyUint8Function(&OS::MemCopyUint8Wrapper); OS::memcopy_uint16_uint8_function = CreateMemCopyUint16Uint8Function(&OS::MemCopyUint16Uint8Wrapper); +#elif defined(V8_HOST_ARCH_MIPS) + OS::memcopy_uint8_function = + CreateMemCopyUint8Function(&OS::MemCopyUint8Wrapper); #endif init_fast_log_function(); // fast_exp is initialized lazily. 
diff --git a/src/platform.h b/src/platform.h
index 6bc94e14d7..f94a5266fa 100644
--- a/src/platform.h
+++ b/src/platform.h
@@ -365,6 +365,26 @@ class OS {
                                         size_t size) {
     (*memcopy_uint16_uint8_function)(dest, src, size);
   }
+#elif defined(V8_HOST_ARCH_MIPS)
+  typedef void (*MemCopyUint8Function)(uint8_t* dest,
+                                       const uint8_t* src,
+                                       size_t size);
+  static MemCopyUint8Function memcopy_uint8_function;
+  static void MemCopyUint8Wrapper(uint8_t* dest,
+                                  const uint8_t* src,
+                                  size_t chars) {
+    memcpy(dest, src, chars);
+  }
+  // For values < 16, the assembler function is slower than the inlined C code.
+  static const int kMinComplexMemCopy = 16;
+  static void MemCopy(void* dest, const void* src, size_t size) {
+    (*memcopy_uint8_function)(reinterpret_cast<uint8_t*>(dest),
+                              reinterpret_cast<const uint8_t*>(src),
+                              size);
+  }
+  static void MemMove(void* dest, const void* src, size_t size) {
+    memmove(dest, src, size);
+  }
 #else
   // Copy memory area to disjoint memory area.
   static void MemCopy(void* dest, const void* src, size_t size) {
diff --git a/src/v8utils.h b/src/v8utils.h
index 02e57ebe72..058b153a7e 100644
--- a/src/v8utils.h
+++ b/src/v8utils.h
@@ -266,6 +266,9 @@ INLINE(static void CopyCharsUnsigned(sinkchar* dest,
 INLINE(void CopyCharsUnsigned(uint8_t* dest, const uint8_t* src, int chars));
 INLINE(void CopyCharsUnsigned(uint16_t* dest, const uint8_t* src, int chars));
 INLINE(void CopyCharsUnsigned(uint16_t* dest, const uint16_t* src, int chars));
+#elif defined(V8_HOST_ARCH_MIPS)
+INLINE(void CopyCharsUnsigned(uint8_t* dest, const uint8_t* src, int chars));
+INLINE(void CopyCharsUnsigned(uint16_t* dest, const uint16_t* src, int chars));
 #endif
 
 // Copy from ASCII/16bit chars to ASCII/16bit chars.
@@ -421,6 +424,24 @@ void CopyCharsUnsigned(uint16_t* dest, const uint16_t* src, int chars) {
       break;
   }
 }
+
+
+#elif defined(V8_HOST_ARCH_MIPS)
+void CopyCharsUnsigned(uint8_t* dest, const uint8_t* src, int chars) {
+  if (chars >= OS::kMinComplexMemCopy) {  // Long enough for OS::MemCopy.
+    OS::MemCopy(dest, src, chars);
+  } else {  // Below the threshold: plain memcpy.
+    memcpy(dest, src, chars);
+  }
+}
+
+void CopyCharsUnsigned(uint16_t* dest, const uint16_t* src, int chars) {
+  if (chars >= OS::kMinComplexMemCopy) {  // Threshold is in chars, not bytes.
+    OS::MemCopy(dest, src, chars * sizeof(uint16_t));
+  } else {
+    memcpy(dest, src, chars * sizeof(uint16_t));
+  }
+}
 #endif