/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Higher performance (up to 10x) atomic increment using thread caching.
 *
 * @author Spencer Ahrens (sahrens)
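 *
 * Example usage (a sketch; the counter and helper functions below are
 * illustrative, not part of this header):
 *
 *   folly::ThreadCachedInt<int64_t> itemsProcessed;
 *
 *   void onItem() {
 *     ++itemsProcessed;          // usually just a thread-local cached update
 *   }
 *
 *   int64_t approxCount() { return itemsProcessed.readFast(); }
 *   int64_t exactCount()  { return itemsProcessed.readFull(); }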
 */

#pragma once

#include <atomic>
#include <cstdint>

#include <boost/noncopyable.hpp>

#include <folly/Likely.h>
#include <folly/ThreadLocal.h>

namespace folly {


// Note that readFull requires holding a lock and iterating through all of the
// thread-local objects with the same Tag, so if you have a lot of
// ThreadCachedInts you should consider breaking up the Tag space even
// further.
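//
// For example (a sketch; the tag structs and counter names are illustrative
// only), giving separate tags to unrelated counters shrinks the set of
// thread-local objects that share a Tag:
//
//   struct NetBytesTag {};
//   struct RequestsTag {};
//   folly::ThreadCachedInt<int64_t, NetBytesTag> netBytes_;
//   folly::ThreadCachedInt<int64_t, RequestsTag> numRequests_;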
template <class IntT, class Tag=IntT>
class ThreadCachedInt : boost::noncopyable {
  struct IntCache;

 public:
  explicit ThreadCachedInt(IntT initialVal = 0, uint32_t cacheSize = 1000)
    : target_(initialVal), cacheSize_(cacheSize) {
  }

  void increment(IntT inc) {
    auto cache = cache_.get();
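    // Lazily create this thread's cache on first use; the parent_ check guards
    // against reusing a cache whose parent has already gone away.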
    if (UNLIKELY(cache == nullptr || cache->parent_ == nullptr)) {
      cache = new IntCache(*this);
      cache_.reset(cache);
    }
    cache->increment(inc);
  }

  // Quickly grabs the current value which may not include some cached
  // increments.
  IntT readFast() const {
    return target_.load(std::memory_order_relaxed);
  }

  // Reads the current value plus all the cached increments.  Requires grabbing
  // a lock, so this is significantly slower than readFast().
  IntT readFull() const {
    // This could race with thread destruction and so the access lock should be
    // acquired before reading the current value
    auto accessor = cache_.accessAllThreads();
    IntT ret = readFast();
    for (const auto& cache : accessor) {
      if (!cache.reset_.load(std::memory_order_acquire)) {
        ret += cache.val_.load(std::memory_order_relaxed);
      }
    }
    return ret;
  }

  // Quickly reads and resets current value (doesn't reset cached increments).
  IntT readFastAndReset() {
    return target_.exchange(0, std::memory_order_release);
  }

  // This function is designed for accumulating into another counter, where you
  // only want to count each increment once.  It can still get the count a
  // little off, but it should be much better than calling readFull() and
  // set(0) sequentially.
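  //
  // For example (a sketch; `total_` and `perIntervalCount_` are illustrative,
  // not part of this class):
  //
  //   total_ += perIntervalCount_.readFullAndReset();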
  IntT readFullAndReset() {
    // This could race with thread destruction and so the access lock should be
    // acquired before reading the current value
    auto accessor = cache_.accessAllThreads();
    IntT ret = readFastAndReset();
    for (auto& cache : accessor) {
      if (!cache.reset_.load(std::memory_order_acquire)) {
        ret += cache.val_.load(std::memory_order_relaxed);
        cache.reset_.store(true, std::memory_order_release);
      }
    }
    return ret;
  }

  void setCacheSize(uint32_t newSize) {
    cacheSize_.store(newSize, std::memory_order_release);
  }

  uint32_t getCacheSize() const {
    return cacheSize_.load();
  }

  ThreadCachedInt& operator+=(IntT inc) { increment(inc); return *this; }
  ThreadCachedInt& operator-=(IntT inc) { increment(-inc); return *this; }
  // pre-increment (we don't support post-increment)
  ThreadCachedInt& operator++() { increment(1); return *this; }
  ThreadCachedInt& operator--() { increment(-1); return *this; }

  // Thread-safe set function.
  // This is a best effort implementation. In some edge cases, there could be
  // data loss (missing counts)
  void set(IntT newVal) {
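    // An increment() that observed reset_ == false just before this store can
    // still publish into a cache that is about to be discarded; that update is
    // the "missing counts" edge case noted above.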
    for (auto& cache : cache_.accessAllThreads()) {
      cache.reset_.store(true, std::memory_order_release);
    }
    target_.store(newVal, std::memory_order_release);
  }

  // This is a little tricky - it's possible that our IntCaches are still alive
  // in another thread and will get destroyed after this destructor runs, so we
  // need to make sure we signal that this parent is dead.
  ~ThreadCachedInt() {
    for (auto& cache : cache_.accessAllThreads()) {
      cache.parent_ = nullptr;
    }
  }

 private:
  std::atomic<IntT> target_;
  std::atomic<uint32_t> cacheSize_;
  ThreadLocalPtr<IntCache, Tag, AccessModeStrict>
      cache_; // Must be last for dtor ordering

  // val_ is written only by the owning thread; reset_ may also be set by other
  // threads (via set() and readFullAndReset()), and both fields may be read
  // concurrently via accessAllThreads(), hence the atomics.
  struct IntCache {
    ThreadCachedInt* parent_;
    mutable std::atomic<IntT> val_;
    mutable uint32_t numUpdates_;
    std::atomic<bool> reset_;

    explicit IntCache(ThreadCachedInt& parent)
        : parent_(&parent), val_(0), numUpdates_(0), reset_(false) {}

    void increment(IntT inc) {
      if (LIKELY(!reset_.load(std::memory_order_acquire))) {
        // This thread is the only writer to val_, so it's fine to do
        // a relaxed load and do the addition non-atomically.
        val_.store(
          val_.load(std::memory_order_relaxed) + inc,
          std::memory_order_release
        );
      } else {
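        // A reset was requested (by set() or readFullAndReset()), so the old
        // cached value is stale; restart the cache from just this increment
        // before clearing the flag.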
        val_.store(inc, std::memory_order_relaxed);
        reset_.store(false, std::memory_order_release);
      }
      ++numUpdates_;
      if (UNLIKELY(numUpdates_ >
                   parent_->cacheSize_.load(std::memory_order_acquire))) {
        flush();
      }
    }

    void flush() const {
      parent_->target_.fetch_add(val_, std::memory_order_release);
      val_.store(0, std::memory_order_release);
      numUpdates_ = 0;
    }

    ~IntCache() {
      if (parent_) {
        flush();
      }
    }
  };
};

} // namespace folly