aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Collingbourne <peter@pcc.me.uk>2019-01-22 23:54:49 +0000
committerPeter Collingbourne <peter@pcc.me.uk>2019-01-22 23:54:49 +0000
commite67de3fa5ebbf23fd0685e1c04e3e424f1d50986 (patch)
treebeaea1f91692810f587d3efc98914b318c739131
parentf2a1028469505507e44e9a8b67aa64b378a81572 (diff)
COFF, ELF: ICF: Perform 2 rounds of relocation hash propagation.
LLD's performance on PGO instrumented Windows binaries was still not great even with the fix in D56955; out of the 2m41s linker runtime, around 2 minutes were still being spent in ICF. I looked into this more closely and discovered that the vast majority of the runtime was being spent segregating .pdata sections with the following relocation chain: .pdata -> identical .text -> unique PGO counter (not eligible for ICF) This patch causes us to perform 2 rounds of relocation hash propagation, which allows the hash for the .pdata sections to incorporate the identifier from the PGO counter. With that, the amount of time spent in ICF was reduced to about 2 seconds. I also found that the same change led to a significant ICF performance improvement in a regular release build of Chromium's chrome_child.dll, where ICF time was reduced from around 1s to around 700ms. With the same change applied to the ELF linker, median of 100 runs for lld-speed-test/chrome reduced from 4.53s to 4.45s on my machine. I also experimented with increasing the number of propagation rounds further, but I did not observe any further significant performance improvements linking Chromium or Firefox. Differential Revision: https://reviews.llvm.org/D56986 git-svn-id: https://llvm.org/svn/llvm-project/lld/trunk@351899 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--COFF/ICF.cpp20
-rw-r--r--ELF/ICF.cpp25
2 files changed, 25 insertions, 20 deletions
diff --git a/COFF/ICF.cpp b/COFF/ICF.cpp
index 88e67443c..7684b2c80 100644
--- a/COFF/ICF.cpp
+++ b/COFF/ICF.cpp
@@ -262,19 +262,21 @@ void ICF::run(ArrayRef<Chunk *> Vec) {
// Initially, we use hash values to partition sections.
parallelForEach(Chunks, [&](SectionChunk *SC) {
- SC->Class[1] = xxHash64(SC->getContents());
+ SC->Class[0] = xxHash64(SC->getContents());
});
// Combine the hashes of the sections referenced by each section into its
// hash.
- parallelForEach(Chunks, [&](SectionChunk *SC) {
- uint32_t Hash = SC->Class[1];
- for (Symbol *B : SC->symbols())
- if (auto *Sym = dyn_cast_or_null<DefinedRegular>(B))
- Hash += Sym->getChunk()->Class[1];
- // Set MSB to 1 to avoid collisions with non-hash classs.
- SC->Class[0] = Hash | (1U << 31);
- });
+ for (unsigned Cnt = 0; Cnt != 2; ++Cnt) {
+ parallelForEach(Chunks, [&](SectionChunk *SC) {
+ uint32_t Hash = SC->Class[Cnt % 2];
+ for (Symbol *B : SC->symbols())
+ if (auto *Sym = dyn_cast_or_null<DefinedRegular>(B))
+ Hash += Sym->getChunk()->Class[Cnt % 2];
+ // Set MSB to 1 to avoid collisions with non-hash classs.
+ SC->Class[(Cnt + 1) % 2] = Hash | (1U << 31);
+ });
+ }
// From now on, sections in Chunks are ordered so that sections in
// the same group are consecutive in the vector.
diff --git a/ELF/ICF.cpp b/ELF/ICF.cpp
index 3f326fa33..7c07a42a8 100644
--- a/ELF/ICF.cpp
+++ b/ELF/ICF.cpp
@@ -425,16 +425,17 @@ void ICF<ELFT>::forEachClass(llvm::function_ref<void(size_t, size_t)> Fn) {
// Combine the hashes of the sections referenced by the given section into its
// hash.
template <class ELFT, class RelTy>
-static void combineRelocHashes(InputSection *IS, ArrayRef<RelTy> Rels) {
- uint32_t Hash = IS->Class[1];
+static void combineRelocHashes(unsigned Cnt, InputSection *IS,
+ ArrayRef<RelTy> Rels) {
+ uint32_t Hash = IS->Class[Cnt % 2];
for (RelTy Rel : Rels) {
Symbol &S = IS->template getFile<ELFT>()->getRelocTargetSym(Rel);
if (auto *D = dyn_cast<Defined>(&S))
if (auto *RelSec = dyn_cast_or_null<InputSection>(D->Section))
- Hash += RelSec->Class[1];
+ Hash += RelSec->Class[Cnt % 2];
}
// Set MSB to 1 to avoid collisions with non-hash IDs.
- IS->Class[0] = Hash | (1U << 31);
+ IS->Class[(Cnt + 1) % 2] = Hash | (1U << 31);
}
static void print(const Twine &S) {
@@ -452,15 +453,17 @@ template <class ELFT> void ICF<ELFT>::run() {
// Initially, we use hash values to partition sections.
parallelForEach(Sections, [&](InputSection *S) {
- S->Class[1] = xxHash64(S->data());
+ S->Class[0] = xxHash64(S->data());
});
- parallelForEach(Sections, [&](InputSection *S) {
- if (S->AreRelocsRela)
- combineRelocHashes<ELFT>(S, S->template relas<ELFT>());
- else
- combineRelocHashes<ELFT>(S, S->template rels<ELFT>());
- });
+ for (unsigned Cnt = 0; Cnt != 2; ++Cnt) {
+ parallelForEach(Sections, [&](InputSection *S) {
+ if (S->AreRelocsRela)
+ combineRelocHashes<ELFT>(Cnt, S, S->template relas<ELFT>());
+ else
+ combineRelocHashes<ELFT>(Cnt, S, S->template rels<ELFT>());
+ });
+ }
// From now on, sections in Sections vector are ordered so that sections
// in the same equivalence class are consecutive in the vector.