/* -*- C++ -*- */

/*
  The Hoard Multiprocessor Memory Allocator
  www.hoard.org

  Author: Emery Berger, http://www.cs.umass.edu/~emery
 
  Copyright (c) 1998-2004, The University of Texas at Austin

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
  
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*/

/*

  Compile with:

  cl /LD /I../../heaplayers /nologo /Ox /DNDEBUG /D_MT /D_DLL /DWIN32 /D_WIN32 /D_WINDOWS winhoard.cpp /link /subsystem:console /dll /incremental:no

  and link usewinhoard.cpp with your executable.

 */


#include <windows.h>

// NOTE(review): both macros below are defined AFTER <windows.h> has been
// included above, so they cannot affect how that include was preprocessed.
// They are still visible to the headers included later in this file.
// If they were meant to configure <windows.h>, they would have to precede
// the #include -- confirm the intent before moving them.
#define WIN32_LEAN_AND_MEAN
#define _WIN32_WINNT 0x0500

// Request the largest inline expansion depth the pragma accepts.
#pragma inline_depth(255)

#pragma warning(disable: 4273)  // Presumably "inconsistent DLL linkage" -- TODO confirm.
#pragma warning(disable: 4098)  // Library conflict.
#pragma warning(disable: 4355)  // 'this' used in base member initializer list.
#pragma warning(disable: 4074)	// initializers put in compiler reserved area.

// Put this file's static initializers in the compiler-reserved
// initialization segment (this is what warning 4074 above is about).
#pragma init_seg(compiler)
// Linker directives: fold the .CRT section into .data, and refuse to link
// against the C runtime libraries named below.
#pragma comment(linker, "/merge:.CRT=.data")
#pragma comment(linker, "/disallowlib:libc.lib")
#pragma comment(linker, "/disallowlib:libcd.lib")
#pragma comment(linker, "/disallowlib:libcmt.lib")
#pragma comment(linker, "/disallowlib:libcmtd.lib")
#pragma comment(linker, "/disallowlib:msvcrtd.lib")

// Pointers to the C runtime's memcpy and memset. PatchMeIn() fills them in
// with GetProcAddress() when it obtains a CRT module handle; until then they
// hold their zero-initialized (NULL) value. hoard_realloc() calls through
// hoard_memcpy_ptr and hoard_calloc() calls through hoard_memset_ptr.
void (*hoard_memcpy_ptr)(void *dest, const void *src, size_t count);
void (*hoard_memset_ptr)(void *dest, int c, size_t count);


// Disable lock optimization.
// Always 1 here. Nothing else in this file reads it; presumably the
// included Hoard / Heap Layers headers consult it -- TODO confirm.
volatile int anyThreadCreated = 1;

/// The maximum amount of memory that each TLAB may hold, in bytes.
/// Passed as a template argument to the TLAB typedef below.
enum { MAX_MEMORY_PER_TLAB = 64 * 1024 };

/// The maximum number of threads supported (sort of).
/// Passed as the first template argument of HoardHeap.
enum { MaxThreads = 1024 };

/// The maximum number of heaps supported.
/// Passed as the second template argument of HoardHeap.
enum { NumHeaps = 128 };

#include "computethreadstacksize.h"
#include "cpuinfo.h"
#include "hoard.h"
#include "heapmanager.h"
#include "tlab.h"

/// The concrete heap type used by every hoard_* entry point in this file:
/// a HeapManager over a HoardHeap sized by MaxThreads and NumHeaps.
/// It adds no members of its own; it only gives the instantiation a name.
class TheCustomHeapType :
  public HeapManager<TheLockType, HoardHeap<MaxThreads, NumHeaps> > {};

/// Return the single Hoard heap instance, constructing it on first use.

inline static TheCustomHeapType * getCustomHeap (void) {
  // Raw storage for the heap object. It is declared as an array of
  // doubles, rounded up by one element, so that it is at least
  // sizeof(TheCustomHeapType) bytes and has the alignment of a double.
  static double heapStorage[sizeof(TheCustomHeapType) / sizeof(double) + 1];

  // Function-local static: the placement new below is the initializer,
  // so the heap is built inside heapStorage the first time control
  // passes through here, before any caller can use it.
  static TheCustomHeapType * theHeap = new (heapStorage) TheCustomHeapType;

  return theHeap;
}

/// The thread-local allocation buffer type. It is parameterized by the
/// size-class table of bins<NoHeader, SUPERBLOCK_SIZE> (bin count and the
/// two size <-> class mapping functions), the per-TLAB memory cap, and the
/// superblock and per-thread heap types exported by TheCustomHeapType.
typedef ThreadLocalAllocationBuffer<bins<NoHeader, SUPERBLOCK_SIZE>::NUM_BINS,
				    bins<NoHeader, SUPERBLOCK_SIZE>::getSizeClass,
				    bins<NoHeader, SUPERBLOCK_SIZE>::getClassSize,
				    MAX_MEMORY_PER_TLAB,
				    TheCustomHeapType::SuperblockType,
				    SUPERBLOCK_SIZE,
				    TheCustomHeapType::PerThreadHeap> TLAB;

// Per-thread raw storage in which getTLABslowPath() constructs this
// thread's TLAB (an array of doubles at least sizeof(TLAB) bytes long).
__declspec(thread) double tlabBuf[sizeof(TLAB) / sizeof(double) + 1];
// Per-thread pointer to the TLAB living in tlabBuf. It is NULL until
// getTLABslowPath() has run on the thread; HoardDllMain also resets it to
// NULL on DLL_THREAD_ATTACH.
__declspec(thread) TLAB * tlab;


/// Construct the calling thread's TLAB inside its thread-local buffer,
/// backed by the shared Hoard heap, and publish it through `tlab`.
static TLAB * getTLABslowPath (void) {
  TheCustomHeapType * sharedHeap = getCustomHeap();
  tlab = new (tlabBuf) TLAB (&sharedHeap->getHeap());
  return tlab;
}

/// Return the calling thread's TLAB, creating it on the first call made
/// from that thread.
static __forceinline TLAB * getTLAB (void) {
  // `tlab` is thread-specific data, so no locking is involved.
  TLAB * mine = tlab;
  if (mine == NULL) {
    // First use on this thread: build the buffer.
    mine = getTLABslowPath();
  }
  return mine;
}


/// Replacement for _msize: report the size the heap records for `ptr`.
/// A NULL pointer yields 0 without consulting the heap.
extern "C" size_t hoard_getsize (void * ptr)
{
  // Look the heap up once and cache the pointer for later calls.
  static TheCustomHeapType * theCustomHeap = getCustomHeap();

  return (ptr != NULL) ? theCustomHeap->getSize(ptr) : 0;
}

// Intercept the exit functions.
//
// Registered handlers are kept in a fixed-size table and run in reverse
// order of registration (last registered, first run).

static const int HOARD_MAX_EXIT_FUNCTIONS = 255;

// Number of handlers currently stored in exitFunctionBuffer.
static int exitCount = 0;

extern "C" {

  typedef void (*exitFunctionType) (void);
  exitFunctionType exitFunctionBuffer[HOARD_MAX_EXIT_FUNCTIONS];

  /// Record `function` to be run by hoard_exit(). Once the table is
  /// full, further registrations are silently dropped.
  void hoard_onexit (void (*function)(void)) {
    if (exitCount >= HOARD_MAX_EXIT_FUNCTIONS) {
      return;
    }
    exitFunctionBuffer[exitCount++] = function;
  }

  /// Run every registered handler, newest first, leaving the table empty.
  /// `code` is not used, and this function returns to its caller.
  void hoard_exit (int code) {
    while (exitCount > 0) {
      // Pop before calling, so a handler that registers another one
      // sees a consistent table.
      exitFunctionType handler = exitFunctionBuffer[--exitCount];
      handler();
    }
  }
}

/// Replacement for malloc (and, via the patch tables, operator new).
/// Requests up to TheCustomHeapType::BIG_OBJECT bytes are served from the
/// calling thread's TLAB; larger ones go straight to the shared heap.
extern "C" void * hoard_malloc (size_t sz) {

  // Make sure every block is at least big enough to hold two pointers.
  const size_t smallestBlock = 2 * sizeof(size_t);
  if (sz < smallestBlock) {
    sz = smallestBlock;
  }

  // No explicit double-word rounding is done here: per the original
  // author's note, all requests are rounded up in the TLAB.

  if (sz > TheCustomHeapType::BIG_OBJECT) {
    // Big object: use the base heap. The cached pointer is scoped to
    // this branch so the small-object path avoids the static check.
    static TheCustomHeapType * heap = getCustomHeap();
    return heap->malloc (sz);
  }

  // Small object: allocate locally from this thread's TLAB.
  return getTLAB()->malloc (sz);
}


/// Replacement for free (and, via the patch tables, operator delete).
/// Objects no larger than TheCustomHeapType::BIG_OBJECT are returned to
/// the calling thread's TLAB; larger ones go back to the shared heap.
/// Freeing NULL is a no-op.
extern "C" void hoard_free (void * ptr) {

  // free(NULL) must do nothing. Return before asking the heap for the
  // size of a null pointer; hoard_getsize() guards the same call.
  if (ptr == NULL) {
    return;
  }

  // Use the TLAB for small objects, and if we haven't created a thread stack.
  size_t sz = getCustomHeap()->getSize (ptr);
  if (sz <= TheCustomHeapType::BIG_OBJECT) {
    TLAB * t = getTLAB();
    t->free (ptr);
  } else {
    static TheCustomHeapType * heap = getCustomHeap();
    heap->free (ptr);
  }
}

/*** below are generic replacement functions for the malloc family ***/

/// Replacement for calloc: allocate room for `nelem` objects of `elsize`
/// bytes each and zero it.
/// Returns NULL if the allocation fails or if nelem * elsize does not fit
/// in a size_t.
extern "C" void * hoard_calloc (size_t nelem, size_t elsize)
{
  // Reject products that would wrap around. Without this check a huge
  // request would silently become a small allocation.
  if (elsize != 0 && nelem > ((size_t) -1) / elsize) {
    return NULL;
  }

  const size_t n = nelem * elsize;
  void * ptr = hoard_malloc (n);
  // Zero out the malloc'd block.
  if (ptr != NULL) {
    (hoard_memset_ptr) (ptr, 0, n);
  }
  return ptr;
}


/// Duplicate the NUL-terminated string `s` into memory obtained from
/// hoard_malloc. Returns NULL when `s` is NULL or the allocation fails.
extern "C" char * hoard_strdup (const char * s)
{
  if (s == NULL) {
    return NULL;
  }

  // One extra byte for the terminating NUL.
  char * copy = (char *) hoard_malloc (strlen(s) + 1);
  if (copy != NULL) {
    strcpy (copy, s);
  }
  return copy;
}


extern "C" void * hoard_realloc (void * ptr, size_t sz)
{
  static TheCustomHeapType * theCustomHeap = getCustomHeap();
  if (ptr == NULL) {
    ptr = theCustomHeap->malloc (sz);
    return ptr;
  }
  if (sz == 0) {
    theCustomHeap->free (ptr);
    return NULL;
  }

  size_t objSize = theCustomHeap->getSize(ptr);
  if (objSize >= sz) {
    return ptr;
  }
  void * buf = theCustomHeap->malloc ((size_t) (sz));

  if (buf != NULL) {
    // Copy the contents of the original object
    // up to the size of the new block.
    size_t minSize = (objSize < sz) ? objSize : sz;
    (hoard_memcpy_ptr) (buf, ptr, minSize);
  }

  // Free the old block.
  theCustomHeap->free(ptr);

  // Return a pointer to the new one.
  return buf;
}

//const char *RlsCRTLibraryName = "MSVCRT.DLL";
// Module names of the release and debug C runtime DLLs. PatchMeIn() passes
// them to GetModuleHandle() to find the routines to patch.
const char *RlsCRTLibraryName = "MSVCR71.DLL";
const char *DbgCRTLibraryName = "MSVCRTD.DLL";

// First byte PatchIt() writes over a patched routine: the x86 near-jump
// opcode (as the macro name says). It is followed by a 4-byte displacement.
#define IAX86_NEARJMP_OPCODE	  0xe9
// Displacement to store after the opcode: distance from `from` to `to`
// less 5, the size of the opcode byte plus the displacement that PatchIt()
// writes. NOTE(review): the result is cast to `unsigned`, so this assumes
// the distance fits in 32 bits -- confirm before building for other targets.
#define MakeIAX86Offset(to,from)  ((unsigned)((char*)(to)-(char*)(from)) - 5)

// One entry of a patch table: which CRT export to redirect, and where to.
typedef struct
{
  const char *import;		// import name of patch routine
  FARPROC replacement;		// pointer to replacement function
  FARPROC original;		// pointer to original function
				// (filled in by PatchMeIn via GetProcAddress)
  unsigned char codebytes[5];	// 5 bytes of original code storage
				// (saved by PatchIt before it overwrites them)
} PATCH;


/* ------------------------------------------------------------------------ */

// Patch table for the release CRT. Each entry names a CRT export and the
// hoard_* function that PatchMeIn()/PatchIt() redirect it to. The
// `original` field starts at 0 and `codebytes` is left zero-initialized.
static PATCH rls_patches[] = 
  {
    // RELEASE CRT library routines supported by this memory manager.

    // Disabled: no hoard__expand / hoard__heap* functions appear in the
    // part of the file visible here.
#if 0
    {"_expand",		(FARPROC) hoard__expand,	0},
    {"_heapchk",	(FARPROC) hoard__heapchk,	0},
    {"_heapmin",	(FARPROC) hoard__heapmin,	0},
    {"_heapset",	(FARPROC) hoard__heapset,	0},
    {"_heapwalk",	(FARPROC) hoard__heapwalk,	0},
#endif

    // Exit-handler registration and exit are routed to the local table
    // maintained by hoard_onexit / hoard_exit.
    {"_onexit",         (FARPROC) hoard_onexit,    0},
    {"_exit",           (FARPROC) hoard_exit,      0},

    // operator new, new[], delete, delete[].
    // (The strings are the compiler's decorated export names.)

    {"??2@YAPAXI@Z",    (FARPROC) hoard_malloc,    0},
    {"??_U@YAPAXI@Z",   (FARPROC) hoard_malloc,    0},
    {"??3@YAXPAX@Z",    (FARPROC) hoard_free,      0},
    {"??_V@YAXPAX@Z",   (FARPROC) hoard_free,      0},

    // the nothrow variants new, new[].
    // NOTE(review): these go to hoard_malloc here, while the debug table
    // sends the same exports to hoard_new_nothrow.

    {"??2@YAPAXIABUnothrow_t@std@@@Z",  (FARPROC) hoard_malloc, 0},
    {"??_U@YAPAXIABUnothrow_t@std@@@Z", (FARPROC) hoard_malloc, 0},

    // The C allocation functions.
    {"_msize",	(FARPROC) hoard_getsize,		0},
    {"calloc",	(FARPROC) hoard_calloc,		0},
    {"malloc",	(FARPROC) hoard_malloc,		0},
    {"realloc",	(FARPROC) hoard_realloc,		0},
    {"free",	(FARPROC) hoard_free,              0},
  };

// Patch table for the debug CRT; compiled only when _DEBUG is defined.
// NOTE(review): most of the hoard__* replacements named here (and
// hoard_new_nothrow, hoard_debug_operator_new/delete, hoard_nh_malloc_dbg)
// are not defined in the part of the file visible here -- confirm they
// exist before enabling a _DEBUG build.
#ifdef _DEBUG
static PATCH dbg_patches[] = 
  {
    // DEBUG CRT library routines supported by this memory manager.

    {"_calloc_dbg",               (FARPROC) hoard__calloc_dbg,0},
    {"_CrtCheckMemory",	          (FARPROC) hoard__CrtCheckMemory,	0},
    {"_CrtDoForAllClientObjects", (FARPROC) hoard__CrtDoForAllClientObjects, 0},
    {"_CrtDumpMemoryLeaks",       (FARPROC) hoard__CrtDumpMemoryLeaks, 0},
    {"_CrtIsMemoryBlock",         (FARPROC) hoard__CrtIsMemoryBlock, 0},
    {"_CrtIsValidHeapPointer",	  (FARPROC) hoard__CrtIsValidHeapPointer, 0},
    {"_CrtMemCheckpoint",         (FARPROC) hoard__CrtMemCheckpoint, 0},
    {"_CrtMemDifference",         (FARPROC) hoard__CrtMemDifference, 0},
    {"_CrtMemDumpAllObjectsSince",(FARPROC) hoard__CrtMemDumpAllObjectsSince, 0},
    {"_CrtMemDumpStatistics",	  (FARPROC) hoard__CrtMemDumpStatistics, 0},
    {"_CrtSetAllocHook",	  (FARPROC) hoard__CrtSetAllocHook, 0},
    {"_CrtSetBreakAlloc",         (FARPROC) hoard__CrtSetBreakAlloc,0},
    {"_CrtSetDbgFlag",	          (FARPROC) hoard__CrtSetDbgFlag, 0},
    {"_CrtSetDumpClient",(FARPROC) hoard__CrtSetDumpClient, 0},
    {"_expand",		 (FARPROC) hoard__expand, 0},
    {"_expand_dbg",      (FARPROC) hoard__expand_dbg, 0},
    {"_free_dbg",	 (FARPROC) hoard__free_dbg, 0},
    {"_malloc_dbg",      (FARPROC) hoard__malloc_dbg, 0},
    {"_msize",		 (FARPROC) hoard__msize, 0},
    {"_msize_dbg",	 (FARPROC) hoard__msize_dbg, 0},
    {"_realloc_dbg",     (FARPROC) hoard__realloc_dbg, 0},
    {"_heapchk",	 (FARPROC) hoard__heapchk,	0},
    {"_heapmin",	 (FARPROC) hoard__heapmin,	0},
    {"_heapset",	 (FARPROC) hoard__heapset,	0},
    {"_heapwalk",	 (FARPROC) hoard__heapwalk, 0},
    // NOTE(review): "_msize" already appears a few entries above, so
    // PatchMeIn() will patch the same export twice.
    {"_msize",		 (FARPROC) hoard__msize, 0},
    {"calloc",		 (FARPROC) hoard_calloc, 0},
    {"malloc",		 (FARPROC) hoard_malloc, 0},
    {"realloc",		 (FARPROC) hoard_realloc, 0},
    {"free",             (FARPROC) hoard_free, 0},

    // operator new, new[], delete, delete[].

    {"??2@YAPAXI@Z",     (FARPROC) hoard_malloc, 0},
    {"??_U@YAPAXI@Z",    (FARPROC) hoard_malloc, 0},
    {"??3@YAXPAX@Z",     (FARPROC) hoard_free,   0},
    {"??_V@YAXPAX@Z",    (FARPROC) hoard_free,   0},

    // the nothrow variants new, new[].
    // (The release table maps these same exports to hoard_malloc.)

    {"??2@YAPAXIABUnothrow_t@std@@@Z",  (FARPROC) hoard_new_nothrow, 0},
    {"??_U@YAPAXIABUnothrow_t@std@@@Z", (FARPROC) hoard_new_nothrow, 0},

    // The debug versions of operator new & delete.

    {"??2@YAPAXIHPBDH@Z", (FARPROC) hoard_debug_operator_new, 0},
    {"??3@YAXPAXHPBDH@Z", (FARPROC) hoard_debug_operator_delete, 0},
    // And the nh_malloc_foo.

    {"_nh_malloc_dbg",   (FARPROC)hoard_nh_malloc_dbg, 0},
  };
#endif


/// Redirect one CRT routine to its replacement by overwriting the first
/// five bytes at patch->original with a near jump to patch->replacement.
/// The bytes that are overwritten are saved in patch->codebytes first.
///
/// patch->original must already hold the routine's address (PatchMeIn()
/// sets it from GetProcAddress before calling).
static void PatchIt (PATCH *patch)
{
  // Change rights on CRT Library module to execute/read/write.
  // NOTE(review): the return values of VirtualQuery and VirtualProtect are
  // not checked; if either fails, the writes below run anyway.

  MEMORY_BASIC_INFORMATION mbi_thunk;
  VirtualQuery((void*)patch->original, &mbi_thunk, 
	       sizeof(MEMORY_BASIC_INFORMATION));
  // The last argument receives the previous protection, so after this call
  // mbi_thunk.Protect holds the value to restore at the end.
  VirtualProtect(mbi_thunk.BaseAddress, mbi_thunk.RegionSize, 
		 PAGE_EXECUTE_READWRITE, &mbi_thunk.Protect);

  // Patch CRT library original routine:
  // 	save original 5 code bytes for exit restoration
  //		write jmp <patch_routine> (5 bytes long) to original.
  // NOTE(review): no code in the visible part of the file restores the
  // saved bytes.

  memcpy(patch->codebytes, patch->original, sizeof(patch->codebytes));
  unsigned char *patchloc = (unsigned char*)patch->original;
  // One opcode byte, then the displacement to the replacement routine.
  *patchloc++ = IAX86_NEARJMP_OPCODE;
  *(unsigned*)patchloc = MakeIAX86Offset(patch->replacement, patch->original);
	
  // Reset CRT library code to original page protection.

  VirtualProtect(mbi_thunk.BaseAddress, mbi_thunk.RegionSize, 
		 mbi_thunk.Protect, &mbi_thunk.Protect);
}


static bool PatchMeIn (void)
{
  // acquire the module handles for the CRT libraries (release and debug)
  HMODULE RlsCRTLibrary = GetModuleHandle(RlsCRTLibraryName);

#ifdef _DEBUG
  HMODULE DbgCRTLibrary = GetModuleHandle(DbgCRTLibraryName);
#endif

  HMODULE DefCRTLibrary = 
#ifdef _DEBUG
    DbgCRTLibrary? DbgCRTLibrary: 
#endif	
    RlsCRTLibrary;

  // assign function pointers for required CRT support functions
#if 1
  if (DefCRTLibrary)
    {
      hoard_memcpy_ptr = (void(*)(void*,const void*,size_t))
	GetProcAddress(DefCRTLibrary, "memcpy");
      hoard_memset_ptr = (void(*)(void*,int,size_t))
	GetProcAddress(DefCRTLibrary, "memset");
    }
#endif

  // patch all relevant Release CRT Library entry points
  unsigned i;
  bool patchedRls = false;
  if (RlsCRTLibrary)
    for (i = 0; i < sizeof(rls_patches) / sizeof(*rls_patches); i++)
      if (rls_patches[i].original = GetProcAddress(RlsCRTLibrary, rls_patches[i].import))
	{
	  PatchIt(&rls_patches[i]);
	  patchedRls = true;
	}

#ifdef _DEBUG
  // patch all relevant Debug CRT Library entry points
  bool patchedDbg = false;
  if (DbgCRTLibrary)
    for (i = 0; i < sizeof(dbg_patches) / sizeof(*dbg_patches); i++)
      if (dbg_patches[i].original = GetProcAddress(DbgCRTLibrary, dbg_patches[i].import))
	{
	  PatchIt(&dbg_patches[i]);
	  patchedDbg = true;
	}

  // no point in staying loaded if we didn't patch anything...
  return patchedRls || patchedDbg;
#else
  return patchedRls;
#endif
}

extern "C" 
{
  // This global data item is used by the app-linked obj to reference
  // winhoard.dll asap in the executable. We want winhoard.dll's
  // HoardDllMain to run first.

  __declspec(dllexport) int ReferenceHoardStub;

  /// DLL entry point.
  ///
  ///  - DLL_PROCESS_ATTACH: patch the CRT so its allocation routines jump
  ///    to the hoard_* replacements.
  ///  - DLL_THREAD_ATTACH:  bind the new thread to a heap and reset its
  ///    TLAB pointer.
  ///  - DLL_THREAD_DETACH:  flush the thread's TLAB (if it ever created
  ///    one) and release its heap.
  ///
  /// Always returns TRUE, including when PatchMeIn() patched nothing.
  BOOL WINAPI HoardDllMain (HANDLE hinstDLL,
			    DWORD fdwReason,
			    LPVOID lpreserved)
  {
    // Computed once, on the first notification this function receives.
    static int np = CPUInfo::computeNumProcessors();

    switch (fdwReason)
      {
      case DLL_PROCESS_ATTACH:
	// NOTE(review): this asks the loader to stop delivering the
	// DLL_THREAD_ATTACH / DLL_THREAD_DETACH notifications handled
	// below. The call is documented to fail for a DLL with static
	// thread-local storage, which this file declares, so the thread
	// cases presumably still run -- confirm.
	DisableThreadLibraryCalls ((HMODULE)hinstDLL);
	(void) PatchMeIn();
	return TRUE;

      case DLL_PROCESS_DETACH:
	// Notice that we haven't replaced all heap calls! Here's one now.
	// (The one-byte block is deliberately not kept or freed.)
	(void) HeapAlloc (GetProcessHeap(), 0, 1);
	return TRUE;

      case DLL_THREAD_ATTACH:
	if (np == 1) {
	  // Assign the thread to heap 0.
	  getCustomHeap()->chooseZero();
	} else {
	  getCustomHeap()->findUnusedHeap();
	}
	// Reset the thread-local allocation buffer so it will get
	// properly initialized.
	tlab = NULL;
	return TRUE;

      case DLL_THREAD_DETACH:
	// Dump the memory from the TLAB. A thread can arrive here with
	// tlab == NULL: DLL_THREAD_ATTACH resets it, and it is only set
	// again by the thread's first small allocation or free. There is
	// nothing to flush then, and dereferencing it would crash.
	if (tlab != NULL) {
	  tlab->clear();
	}
	if (np != 1) {
	  getCustomHeap()->releaseHeap();
	}
	return TRUE;

      default:
	return TRUE;
      }
  }

} // extern "C"
