@@ -833,6 +833,67 @@ HeapWord* ShenandoahHeap::allocate_from_gclab_slow(Thread* thread, size_t size)
  return gclab->allocate(size);
}

+ HeapWord* ShenandoahHeap::allocate_from_plab_slow(Thread* thread, size_t size) {
+   // New object should fit the PLAB size
+   size_t min_size = MAX2(size, PLAB::min_size());
+
+   // Figure out size of new PLAB, looking back at heuristics. Expand aggressively.
+   size_t new_size = ShenandoahThreadLocalData::plab_size(thread) * 2;
+   new_size = MIN2(new_size, PLAB::max_size());
+   new_size = MAX2(new_size, PLAB::min_size());
+
+   // Record new heuristic value even if we take any shortcut. This captures
+   // the case when moderately-sized objects always take a shortcut. At some point,
+   // heuristics should catch up with them.
+   ShenandoahThreadLocalData::set_plab_size(thread, new_size);
+
+   if (new_size < size) {
+     // New size still does not fit the object. Fall back to shared allocation.
+     // This avoids retiring perfectly good PLABs, when we encounter a large object.
+     return NULL;
+   }
+
+   // Retire current PLAB, and allocate a new one.
+   PLAB* plab = ShenandoahThreadLocalData::plab(thread);
+   retire_plab(plab);
+
+   size_t actual_size = 0;
+   HeapWord* plab_buf = allocate_new_plab(min_size, new_size, &actual_size);
+   if (plab_buf == NULL) {
+     return NULL;
+   }
+
+   assert(size <= actual_size, "allocation should fit");
+
+   if (ZeroTLAB) {
+     // ..and clear it.
+     Copy::zero_to_words(plab_buf, actual_size);
+   } else {
+     // ...and zap just allocated object.
+ #ifdef ASSERT
+     // Skip mangling the space corresponding to the object header to
+     // ensure that the returned space is not considered parsable by
+     // any concurrent GC thread.
+     size_t hdr_size = oopDesc::header_size();
+     Copy::fill_to_words(plab_buf + hdr_size, actual_size - hdr_size, badHeapWordVal);
+ #endif // ASSERT
+   }
+   plab->set_buf(plab_buf, actual_size);
+   return plab->allocate(size);
+ }
+
+ void ShenandoahHeap::retire_plab(PLAB* plab) {
+   size_t waste = plab->waste();
+   HeapWord* top = plab->top();
+   plab->retire();
+   if (top != NULL && plab->waste() > waste) {
+     // If retiring the plab created a filler object, then we
+     // need to register it with our card scanner so it can
+     // safely walk the region backing the plab.
+     card_scan()->register_object(top);
+   }
+ }
+
HeapWord* ShenandoahHeap::allocate_new_tlab(size_t min_size,
                                            size_t requested_size,
                                            size_t* actual_size) {
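Aside (not part of the patch): the resize logic in allocate_from_plab_slow above doubles the thread-local PLAB size, clamps it to [PLAB::min_size(), PLAB::max_size()], records the new heuristic value, and only then decides whether to retire the current buffer or fall back to a shared allocation. A minimal standalone sketch of that arithmetic, using hypothetical placeholder values for the PLAB bounds (the real bounds come from the JVM, not from this snippet):

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Hypothetical stand-ins for PLAB::min_size()/PLAB::max_size(); the real values
// are derived by HotSpot from heap and object-size constraints.
static const size_t kPlabMinWords = 128;
static const size_t kPlabMaxWords = 64 * 1024;

// Mirrors the resize heuristic: double the current size, clamp to [min, max],
// and report whether the requested object still does not fit (shared-alloc fallback).
static size_t next_plab_size(size_t current_words, size_t request_words, bool* fallback_to_shared) {
  size_t new_size = current_words * 2;
  new_size = std::min(new_size, kPlabMaxWords);
  new_size = std::max(new_size, kPlabMinWords);
  *fallback_to_shared = (new_size < request_words);
  return new_size;
}

int main() {
  bool fallback = false;
  size_t sz = kPlabMinWords;
  // A moderately sized request grows the PLAB; an oversized one triggers the fallback.
  sz = next_plab_size(sz, 200, &fallback);      // -> 256 words, no fallback
  printf("new size = %zu, fallback = %d\n", sz, fallback);
  sz = next_plab_size(sz, 1000000, &fallback);  // -> clamped, fallback to shared allocation
  printf("new size = %zu, fallback = %d\n", sz, fallback);
  return 0;
}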
@@ -859,6 +920,19 @@ HeapWord* ShenandoahHeap::allocate_new_gclab(size_t min_size,
  return res;
}

+ HeapWord* ShenandoahHeap::allocate_new_plab(size_t min_size,
+                                             size_t word_size,
+                                             size_t* actual_size) {
+   ShenandoahAllocRequest req = ShenandoahAllocRequest::for_plab(min_size, word_size);
+   HeapWord* res = allocate_memory(req);
+   if (res != NULL) {
+     *actual_size = req.actual_size();
+   } else {
+     *actual_size = 0;
+   }
+   return res;
+ }
+
HeapWord* ShenandoahHeap::allocate_memory(ShenandoahAllocRequest& req) {
  intptr_t pacer_epoch = 0;
  bool in_new_region = false;
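For orientation only (the caller is not shown in this diff): the slow path above is typically reached from a fast-path helper analogous to the existing allocate_from_gclab, which first tries a bump-pointer allocation in the thread-local PLAB and only drops into allocate_from_plab_slow when the buffer cannot satisfy the request. A hedged sketch of what such a wrapper might look like, assuming the thread-local accessors already used elsewhere in this diff; the name and placement are assumptions, not part of the patch:

// Hypothetical fast path; mirrors the GCLAB equivalent but is not shown in this commit.
inline HeapWord* ShenandoahHeap::allocate_from_plab(Thread* thread, size_t size) {
  PLAB* plab = ShenandoahThreadLocalData::plab(thread);
  if (plab == NULL) {
    // No PLAB for this thread: caller falls back to a shared allocation.
    return NULL;
  }
  HeapWord* obj = plab->allocate(size);  // bump-pointer allocation inside the buffer
  if (obj != NULL) {
    return obj;
  }
  // Buffer exhausted (or object too large): take the slow path, which may resize
  // and replace the PLAB, or return NULL to force a shared allocation.
  return allocate_from_plab_slow(thread, size);
}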
@@ -954,23 +1028,25 @@ HeapWord* ShenandoahHeap::allocate_memory_under_lock(ShenandoahAllocRequest& req
  //
  // The thread allocating b and the thread allocating c can "race" in various ways, resulting in confusion, such as last-start
  // representing object b while first-start represents object c. This is why we need to require all register_object()
-   // invocations to be "mutually exclusive". Later, when we use GCLABs to allocate memory for promotions and evacuations,
+   // invocations to be "mutually exclusive". Later, when we use GCLABs and PLABs to allocate memory for promotions and evacuations,
  // the protocol may work something like the following:
-   // 1. The GCLAB is allocated by this (or similar) function, while holding the global lock.
-   // 2. The GCLAB is registered as a single object.
-   // 3. The GCLAB is always aligned at the start of a card memory range and is always a multiple of the card-table memory range size
-   // 3. Individual allocations carved from the GCLAB are not immediately registered
-   // 4. When the GCLAB is eventually retired, all of the objects allocated within the GCLAB are registered in batch by a
-   //    single thread. No further synchronization is required because no other allocations will pertain to the same
+   // 1. The GCLAB/PLAB is allocated by this (or similar) function, while holding the global lock.
+   // 2. The GCLAB/PLAB is always aligned at the start of a card memory range
+   //    and is always a multiple of the card-table memory range size.
+   // 3. Individual allocations carved from a GCLAB/PLAB are not immediately registered.
+   // 4. A PLAB is registered as a single object.
+   // 5. When a PLAB is eventually retired, all of the objects allocated within the GCLAB/PLAB are registered in batch by a
+   //    single thread. No further synchronization is required because no other allocations will pertain to the same
  //    card-table memory ranges.
  //
-   // The other case that needs special handling is promotion of regions en masse. When the region is promoted, all objects contained
-   // within the region are registered. Since the region is a multiple of card-table memory range sizes, there is no need for
-   // synchronization. It might be nice to figure out how to allow multiple threads to work together to register all of the objects in
-   // a promoted region, or at least try to balance the efforts so that different gc threads work on registering the objects of
-   // different heap regions. But that effort will come later.
+   // The other case that needs special handling is region promotion. When a region is promoted, all objects contained
+   // in it are registered. Since the region is a multiple of card table memory range sizes, there is no need for
+   // synchronization.
+   // TODO: figure out how to allow multiple threads to work together to register all of the objects in
+   //       a promoted region, or at least try to balance the efforts so that different GC threads work
+   //       on registering the objects of different heap regions.
  //
-   if (result != NULL && req.affiliation() == ShenandoahRegionAffiliation::OLD_GENERATION) {
+   if (mode()->is_generational() && result != NULL && req.affiliation() == ShenandoahRegionAffiliation::OLD_GENERATION) {
    ShenandoahHeap::heap()->card_scan()->register_object(result);
  }
  return result;
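The numbered protocol above leans on one invariant: because each buffer starts on a card boundary and spans a whole number of cards, the card-table entries touched when one thread batch-registers its retired PLAB can never overlap the entries touched by another thread's PLAB, so no locking is needed at retirement. A small illustrative sketch of that card arithmetic; the card size and helper names below are placeholders for this example, not the real ShenandoahScanRemembered API:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical card geometry for illustration; HotSpot derives the real value from the card table.
static const size_t kCardSizeBytes = 512;

struct CardRange { size_t first_card; size_t last_card; };

// Map a buffer [start, start + size_bytes) to the range of cards it covers.
static CardRange cards_for(uintptr_t start, size_t size_bytes) {
  return { start / kCardSizeBytes, (start + size_bytes - 1) / kCardSizeBytes };
}

int main() {
  // Two buffers, each card-aligned and a multiple of the card size, as the protocol requires.
  CardRange a = cards_for(0 * kCardSizeBytes, 4 * kCardSizeBytes);
  CardRange b = cards_for(4 * kCardSizeBytes, 2 * kCardSizeBytes);
  // Their card ranges cannot overlap, so each owning thread can register its objects
  // at retirement without synchronizing with other threads.
  assert(a.last_card < b.first_card);
  return 0;
}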
@@ -1140,6 +1216,10 @@ class ShenandoahCheckCleanGCLABClosure : public ThreadClosure {
    PLAB* gclab = ShenandoahThreadLocalData::gclab(thread);
    assert(gclab != NULL, "GCLAB should be initialized for %s", thread->name());
    assert(gclab->words_remaining() == 0, "GCLAB should not need retirement");
+
+     PLAB* plab = ShenandoahThreadLocalData::plab(thread);
+     assert(plab != NULL, "PLAB should be initialized for %s", thread->name());
+     assert(plab->words_remaining() == 0, "PLAB should not need retirement");
  }
};

@@ -1155,6 +1235,13 @@ class ShenandoahRetireGCLABClosure : public ThreadClosure {
    if (_resize && ShenandoahThreadLocalData::gclab_size(thread) > 0) {
      ShenandoahThreadLocalData::set_gclab_size(thread, 0);
    }
+
+     PLAB* plab = ShenandoahThreadLocalData::plab(thread);
+     assert(plab != NULL, "PLAB should be initialized for %s", thread->name());
+     ShenandoahHeap::heap()->retire_plab(plab);
+     if (_resize && ShenandoahThreadLocalData::plab_size(thread) > 0) {
+       ShenandoahThreadLocalData::set_plab_size(thread, 0);
+     }
  }
};
