Improve queuing for concurrent OSR.

Specifically, this includes:
- Encapsulating the OSR buffer's data structure as a cyclic buffer.
- Using a cyclic queue instead of UnboundQueue to queue new jobs.
  A cyclic queue supports enqueue and dequeue at both ends in O(1),
  which lets us add OSR jobs to the front of the queue for lower
  compile latency (see the sketch below).
- Disposing of at most one stale OSR job from the buffer per GC, to
  avoid holding onto stale jobs indefinitely.
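The cyclic-queue idea can be sketched as follows: a fixed-capacity ring plus a shift index, so enqueue and dequeue cost O(1) at either end. This is a simplified illustration, not code from the patch; the name CyclicQueue and its methods are assumptions. The patch itself inlines the same idea into OptimizingCompilerThread as the raw array input_queue_ with input_queue_shift_ and input_queue_length_.

// Minimal sketch (hypothetical names) of a fixed-capacity cyclic queue
// with O(1) operations at both ends.
#include <cassert>
#include <cstdio>

template <typename T>
class CyclicQueue {
 public:
  explicit CyclicQueue(int capacity)
      : buffer_(new T[capacity]), capacity_(capacity), shift_(0), length_(0) {}
  ~CyclicQueue() { delete[] buffer_; }

  bool IsFull() const { return length_ == capacity_; }

  // Physical slot of logical position i, relative to the current shift.
  int Index(int i) const { return (i + shift_) % capacity_; }

  // O(1): regular jobs are appended at the back.
  void PushBack(const T& value) {
    assert(!IsFull());
    buffer_[Index(length_++)] = value;
  }

  // O(1): urgent (OSR) jobs cut in line by moving the shift back one slot.
  void PushFront(const T& value) {
    assert(!IsFull());
    shift_ = Index(capacity_ - 1);  // shift_ - 1, kept non-negative
    buffer_[Index(0)] = value;
    length_++;
  }

  // O(1): the consumer always dequeues from the front.
  bool PopFront(T* value) {
    if (length_ == 0) return false;
    *value = buffer_[Index(0)];
    shift_ = Index(1);
    length_--;
    return true;
  }

 private:
  T* buffer_;
  int capacity_;
  int shift_;
  int length_;
};

int main() {
  CyclicQueue<int> queue(4);
  queue.PushBack(1);   // regular recompilation job
  queue.PushBack(2);   // regular recompilation job
  queue.PushFront(3);  // OSR job jumps ahead
  int job;
  while (queue.PopFront(&job)) printf("%d ", job);  // prints: 3 1 2
  return 0;
}

Popping from the back would be symmetric, but the patch only needs front insertion (for OSR jobs) and front removal (in NextInput on the compiler thread).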

R=titzer@chromium.org
BUG=

Review URL: https://codereview.chromium.org/25505002

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@17244 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
Author: yangguo@chromium.org
Date:   2013-10-16 14:47:20 +0000
Commit: b8dd056fa2 (parent 0a6d2dec9f)
3 changed files with 113 additions and 80 deletions

diff --git a/src/heap.cc b/src/heap.cc

@@ -450,6 +450,10 @@ void Heap::GarbageCollectionPrologue() {
 #endif  // DEBUG

   store_buffer()->GCPrologue();
+
+  if (FLAG_concurrent_osr) {
+    isolate()->optimizing_compiler_thread()->AgeBufferedOsrJobs();
+  }
 }
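The hook added above is cheap because AgeBufferedOsrJobs() just calls AddToOsrBuffer(NULL): the cursor advances past at most one stale job and disposes it. The following standalone model of that aging step is an illustration only; Job, AddToBuffer, and AgeBufferedJobs are hypothetical stand-ins for V8's RecompileJob and the OptimizingCompilerThread methods shown in the diff below.

#include <cstddef>

// A job that is "waiting for install" has finished compiling and is waiting
// to be picked up; if nothing ever claims it, it is stale and disposable.
struct Job { bool waiting_for_install; };

const int kCapacity = 8;
Job* buffer[kCapacity];  // NULL means the slot is empty.
int cursor = 0;

void AddToBuffer(Job* job) {
  // Skip slots whose jobs are still compiling. The capacity exceeds the
  // number of in-flight jobs, so an empty or stale slot always exists.
  while (buffer[cursor] != NULL && !buffer[cursor]->waiting_for_install) {
    cursor = (cursor + 1) % kCapacity;
  }
  // Dispose the stale job being evicted (delete on NULL is a no-op), then
  // store the new job -- or NULL, which makes this a pure aging pass.
  delete buffer[cursor];
  buffer[cursor] = job;
  cursor = (cursor + 1) % kCapacity;
}

// Called once per GC prologue: disposes at most one stale job, so stale
// jobs cannot pile up without bound across collections.
void AgeBufferedJobs() { AddToBuffer(NULL); }

Since the patch sizes the OSR buffer four slots larger than the input queue, the scan in AddToOsrBuffer is always guaranteed to find an empty or stale slot.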

diff --git a/src/optimizing-compiler-thread.cc b/src/optimizing-compiler-thread.cc

@@ -93,12 +93,20 @@ void OptimizingCompilerThread::Run() {
 }

+RecompileJob* OptimizingCompilerThread::NextInput() {
+  LockGuard<Mutex> access_input_queue_(&input_queue_mutex_);
+  if (input_queue_length_ == 0) return NULL;
+  RecompileJob* job = input_queue_[InputQueueIndex(0)];
+  ASSERT_NE(NULL, job);
+  input_queue_shift_ = InputQueueIndex(1);
+  input_queue_length_--;
+  return job;
+}
+
 void OptimizingCompilerThread::CompileNext() {
-  RecompileJob* job = NULL;
-  bool result = input_queue_.Dequeue(&job);
-  USE(result);
-  ASSERT(result);
-  Barrier_AtomicIncrement(&queue_length_, static_cast<Atomic32>(-1));
+  RecompileJob* job = NextInput();
+  ASSERT_NE(NULL, job);

   // The function may have already been optimized by OSR.  Simply continue.
   RecompileJob::Status status = job->OptimizeGraph();
@@ -131,7 +139,7 @@ static void DisposeRecompileJob(RecompileJob* job,
 void OptimizingCompilerThread::FlushInputQueue(bool restore_function_code) {
   RecompileJob* job;
-  while (input_queue_.Dequeue(&job)) {
+  while ((job = NextInput())) {
     // This should not block, since we have one signal on the input queue
     // semaphore corresponding to each element in the input queue.
     input_queue_semaphore_.Wait();
@@ -140,7 +148,6 @@ void OptimizingCompilerThread::FlushInputQueue(bool restore_function_code) {
       DisposeRecompileJob(job, restore_function_code);
     }
   }
-  Release_Store(&queue_length_, static_cast<AtomicWord>(0));
 }
@@ -156,12 +163,12 @@ void OptimizingCompilerThread::FlushOutputQueue(bool restore_function_code) {
 void OptimizingCompilerThread::FlushOsrBuffer(bool restore_function_code) {
-  RecompileJob* job;
-  for (int i = 0; i < osr_buffer_size_; i++) {
-    job = osr_buffer_[i];
-    if (job != NULL) DisposeRecompileJob(job, restore_function_code);
+  for (int i = 0; i < osr_buffer_capacity_; i++) {
+    if (osr_buffer_[i] != NULL) {
+      DisposeRecompileJob(osr_buffer_[i], restore_function_code);
+      osr_buffer_[i] = NULL;
+    }
   }
-  osr_cursor_ = 0;
 }
@@ -187,10 +194,9 @@ void OptimizingCompilerThread::Stop() {
   stop_semaphore_.Wait();

   if (FLAG_concurrent_recompilation_delay != 0) {
-    // Barrier when loading queue length is not necessary since the write
-    // happens in CompileNext on the same thread.
-    // This is used only for testing.
-    while (NoBarrier_Load(&queue_length_) > 0) CompileNext();
+    // At this point the optimizing compiler thread's event loop has stopped.
+    // There is no need for a mutex when reading input_queue_length_.
+    while (input_queue_length_ > 0) CompileNext();
     InstallOptimizedFunctions();
   } else {
     FlushInputQueue(false);
@@ -239,7 +245,6 @@ void OptimizingCompilerThread::InstallOptimizedFunctions() {
 void OptimizingCompilerThread::QueueForOptimization(RecompileJob* job) {
   ASSERT(IsQueueAvailable());
   ASSERT(!IsOptimizerThread());
-  Barrier_AtomicIncrement(&queue_length_, static_cast<Atomic32>(1));
   CompilationInfo* info = job->info();
   if (info->is_osr()) {
     if (FLAG_trace_concurrent_recompilation) {
@@ -247,13 +252,24 @@ void OptimizingCompilerThread::QueueForOptimization(RecompileJob* job) {
       info->closure()->PrintName();
       PrintF(" for concurrent on-stack replacement.\n");
     }
-    AddToOsrBuffer(job);
     osr_attempts_++;
     BackEdgeTable::AddStackCheck(info);
+    AddToOsrBuffer(job);
+    // Add job to the front of the input queue.
+    LockGuard<Mutex> access_input_queue(&input_queue_mutex_);
+    ASSERT_LT(input_queue_length_, input_queue_capacity_);
+    // Move shift_ back by one.
+    input_queue_shift_ = InputQueueIndex(input_queue_capacity_ - 1);
+    input_queue_[InputQueueIndex(0)] = job;
+    input_queue_length_++;
   } else {
     info->closure()->MarkInRecompileQueue();
+    // Add job to the back of the input queue.
+    LockGuard<Mutex> access_input_queue(&input_queue_mutex_);
+    ASSERT_LT(input_queue_length_, input_queue_capacity_);
+    input_queue_[InputQueueIndex(input_queue_length_)] = job;
+    input_queue_length_++;
   }
-  input_queue_.Enqueue(job);
   if (FLAG_block_concurrent_recompilation) {
     blocked_jobs_++;
   } else {
@@ -274,15 +290,14 @@ void OptimizingCompilerThread::Unblock() {
 RecompileJob* OptimizingCompilerThread::FindReadyOSRCandidate(
     Handle<JSFunction> function, uint32_t osr_pc_offset) {
   ASSERT(!IsOptimizerThread());
-  RecompileJob* result = NULL;
-  for (int i = 0; i < osr_buffer_size_; i++) {
-    result = osr_buffer_[i];
-    if (result == NULL) continue;
-    if (result->IsWaitingForInstall() &&
-        result->info()->HasSameOsrEntry(function, osr_pc_offset)) {
+  for (int i = 0; i < osr_buffer_capacity_; i++) {
+    RecompileJob* current = osr_buffer_[i];
+    if (current != NULL &&
+        current->IsWaitingForInstall() &&
+        current->info()->HasSameOsrEntry(function, osr_pc_offset)) {
       osr_hits_++;
       osr_buffer_[i] = NULL;
-      return result;
+      return current;
     }
   }
   return NULL;
@@ -292,10 +307,11 @@ RecompileJob* OptimizingCompilerThread::FindReadyOSRCandidate(
 bool OptimizingCompilerThread::IsQueuedForOSR(Handle<JSFunction> function,
                                               uint32_t osr_pc_offset) {
   ASSERT(!IsOptimizerThread());
-  for (int i = 0; i < osr_buffer_size_; i++) {
-    if (osr_buffer_[i] != NULL &&
-        osr_buffer_[i]->info()->HasSameOsrEntry(function, osr_pc_offset)) {
-      return !osr_buffer_[i]->IsWaitingForInstall();
+  for (int i = 0; i < osr_buffer_capacity_; i++) {
+    RecompileJob* current = osr_buffer_[i];
+    if (current != NULL &&
+        current->info()->HasSameOsrEntry(function, osr_pc_offset)) {
+      return !current->IsWaitingForInstall();
     }
   }
   return false;
@@ -304,10 +320,10 @@ bool OptimizingCompilerThread::IsQueuedForOSR(Handle<JSFunction> function,
 bool OptimizingCompilerThread::IsQueuedForOSR(JSFunction* function) {
   ASSERT(!IsOptimizerThread());
-  for (int i = 0; i < osr_buffer_size_; i++) {
-    if (osr_buffer_[i] != NULL &&
-        *osr_buffer_[i]->info()->closure() == function) {
-      return !osr_buffer_[i]->IsWaitingForInstall();
+  for (int i = 0; i < osr_buffer_capacity_; i++) {
+    RecompileJob* current = osr_buffer_[i];
+    if (current != NULL && *current->info()->closure() == function) {
+      return !current->IsWaitingForInstall();
     }
   }
   return false;
@@ -316,27 +332,27 @@ bool OptimizingCompilerThread::IsQueuedForOSR(JSFunction* function) {
 void OptimizingCompilerThread::AddToOsrBuffer(RecompileJob* job) {
   ASSERT(!IsOptimizerThread());
-  // Store into next empty slot or replace next stale OSR job that's waiting
-  // in vain.  Dispose in the latter case.
-  RecompileJob* stale;
+  // Find the next slot that is empty or has a stale job.
   while (true) {
-    stale = osr_buffer_[osr_cursor_];
-    if (stale == NULL) break;
-    if (stale->IsWaitingForInstall()) {
-      CompilationInfo* info = stale->info();
-      if (FLAG_trace_osr) {
-        PrintF("[COSR - Discarded ");
-        info->closure()->PrintName();
-        PrintF(", AST id %d]\n", info->osr_ast_id().ToInt());
-      }
-      DisposeRecompileJob(stale, false);
-      break;
-    }
-    AdvanceOsrCursor();
+    RecompileJob* stale = osr_buffer_[osr_buffer_cursor_];
+    if (stale == NULL || stale->IsWaitingForInstall()) break;
+    osr_buffer_cursor_ = (osr_buffer_cursor_ + 1) % osr_buffer_capacity_;
   }
-  osr_buffer_[osr_cursor_] = job;
-  AdvanceOsrCursor();
+  // Add to found slot and dispose the evicted job.
+  RecompileJob* evicted = osr_buffer_[osr_buffer_cursor_];
+  if (evicted != NULL) {
+    ASSERT(evicted->IsWaitingForInstall());
+    CompilationInfo* info = evicted->info();
+    if (FLAG_trace_osr) {
+      PrintF("[COSR - Discarded ");
+      info->closure()->PrintName();
+      PrintF(", AST id %d]\n", info->osr_ast_id().ToInt());
+    }
+    DisposeRecompileJob(evicted, false);
+  }
+  osr_buffer_[osr_buffer_cursor_] = job;
+  osr_buffer_cursor_ = (osr_buffer_cursor_ + 1) % osr_buffer_capacity_;
 }
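One detail in the hunks above that is easy to miss: to move the queue's front back by one slot, the patch computes InputQueueIndex(input_queue_capacity_ - 1) rather than (input_queue_shift_ - 1) % input_queue_capacity_. In C++ the % operator can yield a negative result when the left operand is negative, so adding capacity - 1 keeps every operand non-negative. A tiny standalone check (illustrative only, not part of the patch):

#include <cstdio>

int main() {
  const int capacity = 8;
  int shift = 0;
  // Naive decrement: (0 - 1) % 8 evaluates to -1 under C++11 semantics,
  // which would index out of bounds.
  printf("naive:   %d\n", (shift - 1) % capacity);
  // The patch's form: (8 - 1 + 0) % 8 == 7, the ring's last slot.
  printf("patched: %d\n", (capacity - 1 + shift) % capacity);
  return 0;
}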

diff --git a/src/optimizing-compiler-thread.h b/src/optimizing-compiler-thread.h

@@ -53,21 +53,29 @@ class OptimizingCompilerThread : public Thread {
       isolate_(isolate),
       stop_semaphore_(0),
       input_queue_semaphore_(0),
-      osr_cursor_(0),
+      input_queue_capacity_(FLAG_concurrent_recompilation_queue_length),
+      input_queue_length_(0),
+      input_queue_shift_(0),
+      osr_buffer_capacity_(FLAG_concurrent_recompilation_queue_length + 4),
+      osr_buffer_cursor_(0),
       osr_hits_(0),
       osr_attempts_(0),
       blocked_jobs_(0) {
     NoBarrier_Store(&stop_thread_, static_cast<AtomicWord>(CONTINUE));
-    NoBarrier_Store(&queue_length_, static_cast<AtomicWord>(0));
-    if (FLAG_concurrent_osr) {
-      osr_buffer_size_ = FLAG_concurrent_recompilation_queue_length + 4;
-      osr_buffer_ = NewArray<RecompileJob*>(osr_buffer_size_);
-      for (int i = 0; i < osr_buffer_size_; i++) osr_buffer_[i] = NULL;
-    }
+    input_queue_ = NewArray<RecompileJob*>(input_queue_capacity_);
+    osr_buffer_ = NewArray<RecompileJob*>(osr_buffer_capacity_);
+    // Mark OSR buffer slots as empty.
+    for (int i = 0; i < osr_buffer_capacity_; i++) osr_buffer_[i] = NULL;
   }

   ~OptimizingCompilerThread() {
-    if (FLAG_concurrent_osr) DeleteArray(osr_buffer_);
+    ASSERT_EQ(0, input_queue_length_);
+#ifdef DEBUG
+    for (int i = 0; i < osr_buffer_capacity_; i++) {
+      CHECK_EQ(NULL, osr_buffer_[i]);
+    }
+#endif
+    DeleteArray(osr_buffer_);
   }

   void Run();
@@ -83,17 +91,15 @@ class OptimizingCompilerThread : public Thread {
   bool IsQueuedForOSR(JSFunction* function);

   inline bool IsQueueAvailable() {
-    // We don't need a barrier since we have a data dependency right
-    // after.
-    Atomic32 current_length = NoBarrier_Load(&queue_length_);
-
-    // This can be queried only from the execution thread.
-    ASSERT(!IsOptimizerThread());
-    // Since only the execution thread increments queue_length_ and
-    // only one thread can run inside an Isolate at one time, a direct
-    // read doesn't introduce a race -- queue_length_ may have decreased
-    // in the meantime, but not increased.
-    return (current_length < FLAG_concurrent_recompilation_queue_length);
+    LockGuard<Mutex> access_input_queue(&input_queue_mutex_);
+    return input_queue_length_ < input_queue_capacity_;
+  }
+
+  inline void AgeBufferedOsrJobs() {
+    // Advance cursor of the cyclic buffer to next empty slot or stale OSR
+    // job.  Dispose said OSR job in the latter case.  Calling this on every
+    // GC should make sure that we do not hold onto stale jobs indefinitely.
+    AddToOsrBuffer(NULL);
   }

 #ifdef DEBUG
@@ -107,12 +113,17 @@ class OptimizingCompilerThread : public Thread {
   void FlushOutputQueue(bool restore_function_code);
   void FlushOsrBuffer(bool restore_function_code);
   void CompileNext();
+  RecompileJob* NextInput();

   // Add a recompilation task for OSR to the cyclic buffer, awaiting OSR entry.
   // Tasks evicted from the cyclic buffer are discarded.
   void AddToOsrBuffer(RecompileJob* compiler);
-  void AdvanceOsrCursor() {
-    osr_cursor_ = (osr_cursor_ + 1) % osr_buffer_size_;
+
+  inline int InputQueueIndex(int i) {
+    int result = (i + input_queue_shift_) % input_queue_capacity_;
+    ASSERT_LE(0, result);
+    ASSERT_LT(result, input_queue_capacity_);
+    return result;
   }

 #ifdef DEBUG
@@ -124,20 +135,22 @@ class OptimizingCompilerThread : public Thread {
   Semaphore stop_semaphore_;
   Semaphore input_queue_semaphore_;

-  // Queue of incoming recompilation tasks (including OSR).
-  UnboundQueue<RecompileJob*> input_queue_;
+  // Circular queue of incoming recompilation tasks (including OSR).
+  RecompileJob** input_queue_;
+  int input_queue_capacity_;
+  int input_queue_length_;
+  int input_queue_shift_;
+  Mutex input_queue_mutex_;

   // Queue of recompilation tasks ready to be installed (excluding OSR).
   UnboundQueue<RecompileJob*> output_queue_;

   // Cyclic buffer of recompilation tasks for OSR.
-  // TODO(yangguo): This may keep zombie tasks indefinitely, holding on to
-  // a lot of memory.  Fix this.
   RecompileJob** osr_buffer_;
-  // Cursor for the cyclic buffer.
-  int osr_cursor_;
-  int osr_buffer_size_;
+  int osr_buffer_capacity_;
+  int osr_buffer_cursor_;

   volatile AtomicWord stop_thread_;
-  volatile Atomic32 queue_length_;
   TimeDelta time_spent_compiling_;
   TimeDelta time_spent_total_;