#include "DMTask.h"
#include "DMTaskRunner.h"
#include "SkCommonFlags.h"

namespace DM {
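
// A Task is one unit of work.  Every task notifies the Reporter when it is
// created and destroyed (so the harness can count outstanding work), and
// deletes itself after it runs.  Rough usage sketch -- hedged: 'SomeTask' is a
// hypothetical CpuTask subclass, and TaskRunner's exact interface is declared
// in DMTaskRunner.h:
//
//   Reporter reporter;
//   TaskRunner runner;
//   runner.add(SkNEW_ARGS(SomeTask, (&reporter, &runner)));
//   runner.wait();  // Presumably blocks until all tasks (and children) finish.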
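
// Root task: starts at depth 0.  The Reporter and TaskRunner are borrowed
// pointers and must outlive the task.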
Task::Task(Reporter* reporter, TaskRunner* taskRunner)
    : fReporter(reporter)
    , fTaskRunner(taskRunner)
    , fDepth(0) {
    fReporter->taskCreated();
}
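
// Child task: shares its parent's Reporter and TaskRunner, one level deeper.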
Task::Task(const Task& parent)
    : fReporter(parent.fReporter)
    , fTaskRunner(parent.fTaskRunner)
    , fDepth(parent.depth() + 1) {
    fReporter->taskCreated();
}
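
// Balances taskCreated() above, so the Reporter can tell how many tasks remain.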
Task::~Task() {
    fReporter->taskDestroyed();
}
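
// Record a failure as "<task name>" or "<task name>: <msg>".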
void Task::fail(const char* msg) {
    SkString failure(this->name());
    if (msg) {
        failure.appendf(": %s", msg);
    }
    fReporter->fail(failure);
}
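
// start() and finish() bracket draw(): together they time the task and report
// its status.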
void Task::start() {
    fStart = SkTime::GetMSecs();
}

void Task::finish() {
    fReporter->printStatus(this->name(), SkTime::GetMSecs() - fStart);
}
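
// Hand a child task to the TaskRunner, queueing it on the CPU threadpool.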
void Task::reallySpawnChild(CpuTask* task) {
    fTaskRunner->add(task);
}
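
// A CpuTask may be created as a root task or as the child of any other task.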
CpuTask::CpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
CpuTask::CpuTask(const Task& parent) : Task(parent) {}

void CpuTask::run() {
    // If the task says skip, or if we're starting a top-level CPU task and we don't want to, skip.
    const bool skip = this->shouldSkip() || (this->depth() == 0 && !FLAGS_cpu);
    if (!skip) {
        this->start();
        if (!FLAGS_dryRun) this->draw();
        this->finish();
    }
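    // Whether it ran or was skipped, the task deletes itself here.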
    SkDELETE(this);
}

void CpuTask::spawnChild(CpuTask* task) {
    // Run children serially on this (CPU) thread. This tends to save RAM and is usually no slower.
    // Calling reallySpawnChild() is nearly equivalent, but it'd pointlessly contend on the
    // threadpool; reallySpawnChild() is most useful when you want to change threadpools.
    task->run();
}
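
// Note there is no child constructor: GpuTasks are always roots (depth 0).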
GpuTask::GpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}

void GpuTask::run(GrContextFactory* factory) {
    // If the task says skip, or if we're starting a top-level GPU task and we don't want to, skip.
    const bool skip = this->shouldSkip() || (this->depth() == 0 && !FLAGS_gpu);
    if (!skip) {
        this->start();
        if (!FLAGS_dryRun) this->draw(factory);
        this->finish();
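        // The flags below optionally abandon and/or destroy the GPU contexts
        // after each task, presumably to exercise context-loss and teardown
        // paths.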
        if (FLAGS_abandonGpuContext) {
            factory->abandonContexts();
        }
        if (FLAGS_resetGpuContext || FLAGS_abandonGpuContext) {
            factory->destroyContexts();
        }
    }
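    // As in CpuTask::run(), the task deletes itself whether it ran or was skipped.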
    SkDELETE(this);
}

void GpuTask::spawnChild(CpuTask* task) {
    // Spawn a new task so it runs on the CPU threadpool instead of the GPU one we're on now.
    // It goes on the front of the queue to minimize the time we must hold reference bitmaps in RAM.
    this->reallySpawnChild(task);
}

} // namespace DM