/*
* PROGRAM: JRD Access Method
* MODULE: cch.cpp
* DESCRIPTION: Disk cache manager
*
* The contents of this file are subject to the Interbase Public
* License Version 1.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy
* of the License at http://www.Inprise.com/IPL.html
*
* Software distributed under the License is distributed on an
* "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express
* or implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code was created by Inprise Corporation
* and its predecessors. Portions created by Inprise Corporation are
* Copyright (C) Inprise Corporation.
*
* All Rights Reserved.
* Contributor(s): ______________________________________.
* 2001.07.06 Sean Leyne - Code Cleanup, removed "#ifdef READONLY_DATABASE"
* conditionals, as the engine now fully supports
* readonly databases.
*
* 2002.10.29 Sean Leyne - Removed obsolete "Netware" port
*
*/
#include "firebird.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "../jrd/jrd.h"
#include "../jrd/que.h"
#include "../jrd/lck.h"
#include "../jrd/ods.h"
#include "../jrd/os/pio.h"
#include "../jrd/cch.h"
#include "gen/iberror.h"
#include "../jrd/lls.h"
#include "../jrd/sdw.h"
#include "../jrd/tra.h"
#include "../jrd/sbm.h"
#include "../jrd/nbak.h"
#include "../common/gdsassert.h"
#include "../jrd/cch_proto.h"
#include "../jrd/err_proto.h"
#include "../yvalve/gds_proto.h"
#include "../common/isc_proto.h"
#include "../common/isc_s_proto.h"
#include "../jrd/jrd_proto.h"
#include "../jrd/lck_proto.h"
#include "../jrd/pag_proto.h"
#include "../jrd/ods_proto.h"
#include "../jrd/os/pio_proto.h"
#include "../jrd/sdw_proto.h"
#include "../jrd/shut_proto.h"
#include "../common/ThreadStart.h"
#include "../jrd/tra_proto.h"
#include "../common/config/config.h"
#include "../common/classes/ClumpletWriter.h"
#include "../common/classes/MsgPrint.h"
#include "../jrd/CryptoManager.h"
#include "../common/utils_proto.h"
using namespace Jrd;
using namespace Ods;
using namespace Firebird;
// In the superserver mode, no page locks are acquired through the lock manager.
// Instead, a latching mechanism is used. So the calls to lock subsystem for
// database pages in the original code should not be made, lest they should cause
// any undesirable side-effects. The following defines help us achieve that.
#ifdef CCH_DEBUG
#include <stdarg.h>
IMPLEMENT_TRACE_ROUTINE(cch_trace, "CCH")
#endif
#ifdef SUPERSERVER_V2
#define CACHE_READER
#endif
static inline void PAGE_LOCK_RELEASE(thread_db* tdbb, BufferControl* bcb, Lock* lock)
{
if (!(bcb->bcb_flags & BCB_exclusive))
{
CCH_TRACE(("LCK RLS %" SQUADFORMAT, lock->getKey()));
LCK_release(tdbb, lock);
}
}
static inline void PAGE_LOCK_ASSERT(thread_db* tdbb, BufferControl* bcb, Lock* lock)
{
if (!(bcb->bcb_flags & BCB_exclusive))
LCK_assert(tdbb, lock);
}
static inline void PAGE_LOCK_RE_POST(thread_db* tdbb, BufferControl* bcb, Lock* lock)
{
if (!(bcb->bcb_flags & BCB_exclusive))
{
CCH_TRACE(("LCK REP %" SQUADFORMAT, lock->getKey()));
LCK_re_post(tdbb, lock);
}
}
#define PAGE_OVERHEAD (sizeof(bcb_repeat) + sizeof(BufferDesc) + sizeof(Lock) + (int) bcb->bcb_page_size)
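// Back-of-envelope sketch (illustrative figures only): with a 4096-byte
// page, PAGE_OVERHEAD is 4096 bytes of buffer plus sizeof(bcb_repeat) +
// sizeof(BufferDesc) + sizeof(Lock) -- a few hundred platform-dependent
// bytes -- so each cache buffer costs slightly more than one page of
// memory. CCH_init relies on this to recompute the buffer count when an
// allocation fails.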
enum LatchState
{
lsOk,
lsTimeout,
lsPageChanged
};
static void adjust_scan_count(WIN* window, bool mustRead);
static BufferDesc* alloc_bdb(thread_db*, BufferControl*, UCHAR **);
static Lock* alloc_page_lock(Jrd::thread_db*, BufferDesc*);
static int blocking_ast_bdb(void*);
#ifdef CACHE_READER
static void prefetch_epilogue(Prefetch*, FbStatusVector *);
static void prefetch_init(Prefetch*, thread_db*);
static void prefetch_io(Prefetch*, FbStatusVector *);
static void prefetch_prologue(Prefetch*, SLONG *);
#endif
static void check_precedence(thread_db*, WIN*, PageNumber);
static void clear_precedence(thread_db*, BufferDesc*);
static BufferDesc* dealloc_bdb(BufferDesc*);
static void down_grade(thread_db*, BufferDesc*, int high = 0);
static bool expand_buffers(thread_db*, ULONG);
static BufferDesc* find_buffer(BufferControl* bcb, const PageNumber page, bool findPending);
static BufferDesc* get_buffer(thread_db*, const PageNumber, SyncType, int);
static int get_related(BufferDesc*, PagesArray&, int, const ULONG);
static ULONG get_prec_walk_mark(BufferControl*);
static LatchState latch_buffer(thread_db*, Sync&, BufferDesc*, const PageNumber, SyncType, int);
static LockState lock_buffer(thread_db*, BufferDesc*, const SSHORT, const SCHAR);
static ULONG memory_init(thread_db*, BufferControl*, SLONG);
static void page_validation_error(thread_db*, win*, SSHORT);
static SSHORT related(BufferDesc*, const BufferDesc*, SSHORT, const ULONG);
static bool writeable(BufferDesc*);
static bool is_writeable(BufferDesc*, const ULONG);
static int write_buffer(thread_db*, BufferDesc*, const PageNumber, const bool, FbStatusVector* const,
const bool);
static bool write_page(thread_db*, BufferDesc*, FbStatusVector* const, const bool);
static bool set_diff_page(thread_db*, BufferDesc*);
static void clear_dirty_flag_and_nbak_state(thread_db*, BufferDesc*);
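// Note on the two helpers below: membership of a BufferDesc in the dirty
// queue is encoded in its own node (bdb_dirty points to itself when
// unlinked), so both helpers use a double-checked pattern -- test once
// without the sync as a fast path, then re-test under SYNC_EXCLUSIVE on
// bcb_syncDirtyBdbs before touching the queue.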
static inline void insertDirty(BufferControl* bcb, BufferDesc* bdb)
{
if (bdb->bdb_dirty.que_forward != &bdb->bdb_dirty)
return;
Sync dirtySync(&bcb->bcb_syncDirtyBdbs, "insertDirty");
dirtySync.lock(SYNC_EXCLUSIVE);
if (bdb->bdb_dirty.que_forward != &bdb->bdb_dirty)
return;
bcb->bcb_dirty_count++;
QUE_INSERT(bcb->bcb_dirty, bdb->bdb_dirty);
}
static inline void removeDirty(BufferControl* bcb, BufferDesc* bdb)
{
if (bdb->bdb_dirty.que_forward == &bdb->bdb_dirty)
return;
Sync dirtySync(&bcb->bcb_syncDirtyBdbs, "removeDirty");
dirtySync.lock(SYNC_EXCLUSIVE);
if (bdb->bdb_dirty.que_forward == &bdb->bdb_dirty)
return;
fb_assert(bcb->bcb_dirty_count > 0);
bcb->bcb_dirty_count--;
QUE_DELETE(bdb->bdb_dirty);
QUE_INIT(bdb->bdb_dirty);
}
static void flushDirty(thread_db* tdbb, SLONG transaction_mask, const bool sys_only);
static void flushAll(thread_db* tdbb, USHORT flush_flag);
static void flushPages(thread_db* tdbb, USHORT flush_flag, BufferDesc** begin, FB_SIZE_T count);
static void recentlyUsed(BufferDesc* bdb);
static void requeueRecentlyUsed(BufferControl* bcb);
const ULONG MIN_BUFFER_SEGMENT = 65536;
// Given a pointer to a field in a block, find the block
#define BLOCK(fld_ptr, type, fld) (type*)((SCHAR*) fld_ptr - offsetof(type, fld))
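// Hypothetical usage sketch: recover the BufferDesc that owns a queue node,
// the classic container_of idiom.
//
//	que* node = bcb->bcb_dirty.que_forward;
//	BufferDesc* bdb = BLOCK(node, BufferDesc, bdb_dirty);
//
// BLOCK subtracts the field's offset from the field's address, yielding a
// pointer to the enclosing block.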
const PageNumber FREE_PAGE(DB_PAGE_SPACE, -1);
const int PRE_SEARCH_LIMIT = 256;
const int PRE_EXISTS = -1;
const int PRE_UNKNOWN = -2;
int CCH_down_grade_dbb(void* ast_object)
{
/**************************************
*
* C C H _ d o w n _ g r a d e _ d b b
*
**************************************
*
* Functional description
* Down grade the lock on the database in response to a blocking
* AST.
*
**************************************/
Database* dbb = static_cast<Database*>(ast_object);
try
{
Lock* const lock = dbb->dbb_lock;
AsyncContextHolder tdbb(dbb, FB_FUNCTION);
SyncLockGuard dsGuard(&dbb->dbb_sync, SYNC_EXCLUSIVE, "CCH_down_grade_dbb");
dbb->dbb_ast_flags |= DBB_blocking;
// Process the database shutdown request, if any
if (SHUT_blocking_ast(tdbb, true))
return 0;
// If we are already shared, there is nothing more we can do.
// In any case, the other guy probably wants exclusive access,
// and we can't give it anyway
if ((lock->lck_logical == LCK_SW) || (lock->lck_logical == LCK_SR))
{
// Fake conversion to the same level as we already own.
// This trick re-enables the AST delivery.
LCK_convert(tdbb, lock, lock->lck_logical, LCK_NO_WAIT);
return 0;
}
if (dbb->dbb_flags & DBB_bugcheck)
{
LCK_convert(tdbb, lock, LCK_SW, LCK_WAIT);
dbb->dbb_ast_flags &= ~DBB_blocking;
return 0;
}
// If we are supposed to be exclusive, stay exclusive
if ((dbb->dbb_flags & DBB_exclusive) || (dbb->dbb_ast_flags & DBB_shutdown_single))
return 0;
// Assert any page locks that have been requested, but not asserted
dbb->dbb_ast_flags |= DBB_assert_locks;
BufferControl* bcb = dbb->dbb_bcb;
if (bcb)
{
SyncLockGuard bcbSync(&bcb->bcb_syncObject, SYNC_EXCLUSIVE, "CCH_down_grade_dbb");
bcb->bcb_flags &= ~BCB_exclusive;
if (bcb->bcb_count)
{
const bcb_repeat* tail = bcb->bcb_rpt;
fb_assert(tail); // once I've got here with NULL. AP.
for (const bcb_repeat* const end = tail + bcb->bcb_count; tail < end; ++tail)
PAGE_LOCK_ASSERT(tdbb, bcb, tail->bcb_bdb->bdb_lock);
}
}
// Down grade the lock on the database itself
if (lock->lck_physical == LCK_EX)
LCK_convert(tdbb, lock, LCK_PW, LCK_WAIT); // This lets waiting cache manager in first
else if (lock->lck_physical == LCK_PW)
LCK_convert(tdbb, lock, LCK_SW, LCK_WAIT);
else
fb_assert(lock->lck_physical == 0);
dbb->dbb_ast_flags &= ~DBB_blocking;
}
catch (const Firebird::Exception&)
{} // no-op
return 0;
}
bool CCH_exclusive(thread_db* tdbb, USHORT level, SSHORT wait_flag, Firebird::Sync* guard)
{
/**************************************
*
* C C H _ e x c l u s i v e
*
**************************************
*
* Functional description
* Get exclusive access to a database. If we get it, return true.
* If the wait flag is FALSE, and we can't get it, give up and
* return false. There are two levels of database exclusivity: LCK_PW
* guarantees there are no normal users in the database while LCK_EX
* additionally guarantees background database processes like the
* shared cache manager have detached.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
if (dbb->dbb_flags & DBB_shared)
{
if (!CCH_exclusive_attachment(tdbb, level, wait_flag, guard))
return false;
}
Lock* lock = dbb->dbb_lock;
if (!lock)
return false;
dbb->dbb_flags |= DBB_exclusive;
switch (level)
{
case LCK_PW:
if (lock->lck_physical >= LCK_PW || LCK_convert(tdbb, lock, LCK_PW, wait_flag))
return true;
break;
case LCK_EX:
if (lock->lck_physical == LCK_EX || LCK_convert(tdbb, lock, LCK_EX, wait_flag))
return true;
break;
default:
break;
}
// Clear the status vector, as our callers check the return value
// and throw custom exceptions themselves
fb_utils::init_status(tdbb->tdbb_status_vector);
// If we are supposed to wait (presumably patiently),
// but can't get the lock, generate an error
if (wait_flag == LCK_WAIT)
ERR_post(Arg::Gds(isc_deadlock));
dbb->dbb_flags &= ~DBB_exclusive;
return false;
}
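// Minimal caller sketch (illustrative; error handling and the 'guard'
// argument omitted): utilities needing single-user access typically try
//
//	if (CCH_exclusive(tdbb, LCK_PW, LCK_NO_WAIT, NULL))
//		... proceed in single-user mode ...
//
// LCK_PW only shuts out normal users; LCK_EX additionally waits for
// background processes such as the shared cache manager to detach.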
bool CCH_exclusive_attachment(thread_db* tdbb, USHORT level, SSHORT wait_flag, Sync* exGuard)
{
/**************************************
*
* C C H _ e x c l u s i v e _ a t t a c h m e n t
*
**************************************
*
* Functional description
* Get exclusive access to a database. If we get it, return true.
* If the wait flag is FALSE, and we can't get it, give up and
* return false.
*
**************************************/
const int CCH_EXCLUSIVE_RETRY_INTERVAL = 1; // retry interval in seconds
SET_TDBB(tdbb);
Database* const dbb = tdbb->getDatabase();
Sync dsGuard(&dbb->dbb_sync, FB_FUNCTION);
const bool exLock = dbb->dbb_sync.ourExclusiveLock();
if (!exLock)
dsGuard.lock(level != LCK_none ? SYNC_EXCLUSIVE : SYNC_SHARED);
else
fb_assert(exGuard);
Jrd::Attachment* const attachment = tdbb->getAttachment();
if (attachment->att_flags & ATT_exclusive)
return true;
attachment->att_flags |= (level == LCK_none) ? ATT_attach_pending : ATT_exclusive_pending;
const SLONG timeout = (wait_flag == LCK_WAIT) ? 1L << 30 : -wait_flag;
// If requesting exclusive database access, then re-position attachment as the
// youngest so that pending attachments may pass.
if (level != LCK_none)
{
for (Jrd::Attachment** ptr = &dbb->dbb_attachments; *ptr; ptr = &(*ptr)->att_next)
{
if (*ptr == attachment)
{
*ptr = attachment->att_next;
break;
}
}
attachment->att_next = dbb->dbb_attachments;
dbb->dbb_attachments = attachment;
if (!exLock)
dsGuard.downgrade(SYNC_SHARED);
}
for (SLONG remaining = timeout; remaining >= 0; remaining -= CCH_EXCLUSIVE_RETRY_INTERVAL)
{
try
{
tdbb->checkCancelState(true);
bool found = false;
for (Jrd::Attachment* other_attachment = attachment->att_next; other_attachment;
other_attachment = other_attachment->att_next)
{
if (level == LCK_none)
{
// Wait for other attachments requesting exclusive access
if (other_attachment->att_flags & (ATT_exclusive | ATT_exclusive_pending))
{
found = true;
break;
}
// Forbid multiple attachments in single-user maintenance mode
if (other_attachment != attachment &&
(dbb->dbb_ast_flags & DBB_shutdown_single))
{
found = true;
break;
}
}
else
{
// Requesting exclusive database access
found = true;
if (other_attachment->att_flags & ATT_exclusive_pending)
{
if (wait_flag == LCK_WAIT)
ERR_post(Arg::Gds(isc_deadlock));
attachment->att_flags &= ~ATT_exclusive_pending;
return false;
}
break;
}
}
if (!found)
{
if (level != LCK_none)
attachment->att_flags |= ATT_exclusive;
attachment->att_flags &= ~(ATT_exclusive_pending | ATT_attach_pending);
return true;
}
// Our thread needs to sleep for CCH_EXCLUSIVE_RETRY_INTERVAL seconds.
if (remaining >= CCH_EXCLUSIVE_RETRY_INTERVAL)
{
SyncUnlockGuard unlock(exLock ? (*exGuard) : dsGuard);
Thread::sleep(CCH_EXCLUSIVE_RETRY_INTERVAL * 1000);
}
} // try
catch (const Exception&)
{
attachment->att_flags &= ~(ATT_exclusive_pending | ATT_attach_pending);
throw;
}
}
attachment->att_flags &= ~(ATT_exclusive_pending | ATT_attach_pending);
return false;
}
bool CCH_expand(thread_db* tdbb, ULONG number)
{
/**************************************
*
* C C H _ e x p a n d
*
**************************************
*
* Functional description
* Expand the cache to at least a given number of buffers. If
* it's already that big, don't do anything.
*
**************************************/
SET_TDBB(tdbb);
return expand_buffers(tdbb, number);
}
pag* CCH_fake(thread_db* tdbb, WIN* window, int wait)
{
/**************************************
*
* C C H _ f a k e
*
**************************************
*
* Functional description
* Fake a fetch to a page. Rather than reading it, however,
* zero it in memory. This is used when allocating a new page.
*
* input
* wait: 1 => Wait as long as necessary to get the latch.
* This can cause deadlocks of course.
* 0 => If the latch can't be acquired immediately,
* or an IO would be necessary, then give
* up and return 0.
* <negative number> => Latch timeout interval in seconds.
*
* return
* pag pointer if successful.
* NULL pointer if timeout occurred (only possible if wait <> 1).
* NULL pointer if wait=0 and the faked page would have to be
* written before reuse.
*
**************************************/
SET_TDBB(tdbb);
Database* const dbb = tdbb->getDatabase();
BufferControl *bcb = dbb->dbb_bcb;
CCH_TRACE(("FAKE %d:%06d", window->win_page.getPageSpaceID(), window->win_page.getPageNum()));
// if there has been a shadow added recently, go out and
// find it before we grant any more write locks
if (dbb->dbb_ast_flags & DBB_get_shadows)
SDW_get_shadows(tdbb);
BufferDesc* bdb = get_buffer(tdbb, window->win_page, SYNC_EXCLUSIVE, wait);
if (!bdb)
return NULL; // latch timeout occurred
// If a dirty orphaned page is being reused - better write it first
// to clear current precedences and checkpoint state. This would also
// update the bcb_free_pages field appropriately.
if (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty))
{
// If the caller didn't want to wait at all, then
// return 'try to fake another page' to the caller.
if (!wait)
{
bdb->release(tdbb, true);
return NULL;
}
if (!write_buffer(tdbb, bdb, bdb->bdb_page, true, tdbb->tdbb_status_vector, true))
{
CCH_unwind(tdbb, true);
}
}
else if (QUE_NOT_EMPTY(bdb->bdb_lower))
{
// Clear residual precedence left over from AST-level I/O.
Sync syncPrec(&bcb->bcb_syncPrecedence, "CCH_fake");
syncPrec.lock(SYNC_EXCLUSIVE);
clear_precedence(tdbb, bdb);
}
// Here the page must not be dirty and have no backup lock owner
fb_assert((bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) == 0);
bdb->bdb_flags &= BDB_lru_chained; // yes, clear all except BDB_lru_chained
bdb->bdb_flags |= (BDB_writer | BDB_faked);
bdb->bdb_scan_count = 0;
if (!(bcb->bcb_flags & BCB_exclusive))
lock_buffer(tdbb, bdb, LCK_WAIT, pag_undefined);
MOVE_CLEAR(bdb->bdb_buffer, (SLONG) dbb->dbb_page_size);
window->win_buffer = bdb->bdb_buffer;
window->win_bdb = bdb;
window->win_flags = 0;
CCH_MARK(tdbb, window);
return bdb->bdb_buffer;
}
pag* CCH_fetch(thread_db* tdbb, WIN* window, int lock_type, SCHAR page_type, int wait,
const bool read_shadow)
{
/**************************************
*
* C C H _ f e t c h
*
**************************************
*
* Functional description
* Fetch a specific page. If it's already in cache,
* so much the better.
*
* input
* wait: 1 => Wait as long as necessary to get the latch.
* This can cause deadlocks of course.
* 0 => If the latch can't be acquired immediately,
* give up and return 0.
* <negative number> => Latch timeout interval in seconds.
*
* return
* PAG if successful.
* NULL pointer if timeout occurred (only possible if wait <> 1).
*
**************************************/
SET_TDBB(tdbb);
const LockState lockState = CCH_fetch_lock(tdbb, window, lock_type, wait, page_type);
BufferDesc* bdb = window->win_bdb;
SyncType syncType = (lock_type >= LCK_write) ? SYNC_EXCLUSIVE : SYNC_SHARED;
switch (lockState)
{
case lsLocked:
CCH_TRACE(("FE PAGE %d:%06d", window->win_page.getPageSpaceID(), window->win_page.getPageNum()));
CCH_fetch_page(tdbb, window, read_shadow); // must read page from disk
if (syncType != SYNC_EXCLUSIVE)
bdb->downgrade(syncType);
break;
case lsLatchTimeout:
case lsLockTimeout:
return NULL; // latch or lock timeout
}
adjust_scan_count(window, lockState == lsLocked);
// Validate the fetched page matches the expected type
if (bdb->bdb_buffer->pag_type != page_type && page_type != pag_undefined)
page_validation_error(tdbb, window, page_type);
return window->win_buffer;
}
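// Typical read-only caller pattern (a sketch, not engine code; error paths
// omitted), using the CCH_FETCH/CCH_RELEASE macro wrappers from cch_proto.h:
//
//	WIN window(DB_PAGE_SPACE, page_number);	// page_number: assumed input
//	pag* page = CCH_FETCH(tdbb, &window, LCK_read, pag_data);
//	... read from the page ...
//	CCH_RELEASE(tdbb, &window);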
LockState CCH_fetch_lock(thread_db* tdbb, WIN* window, int lock_type, int wait, SCHAR page_type)
{
/**************************************
*
* C C H _ f e t c h _ l o c k
*
**************************************
*
* Functional description
* Fetch a latch and lock for a specific page.
*
* input
*
* wait:
* LCK_WAIT (1) => Wait as long as necessary to get the lock.
* This can cause deadlocks of course.
*
* LCK_NO_WAIT (0) =>
* If the latch can't be acquired immediately, give up and return lsLatchTimeout.
* If the lock can't be acquired immediately, give up and return lsLockTimeout.
*
* <negative number> => Lock (latch) timeout interval in seconds.
*
* return
* lsLockedHavePage: fetch & lock were successful, page doesn't need to be read.
* lsLocked: fetch & lock were successful, page needs to be read from disk.
* lsLockTimeout: lock timed out, fetch failed.
* lsLatchTimeout: latch timed out, fetch failed, lock not attempted.
*
**************************************/
SET_TDBB(tdbb);
Database* const dbb = tdbb->getDatabase();
BufferControl* const bcb = dbb->dbb_bcb;
// if there has been a shadow added recently, go out and
// find it before we grant any more write locks
if (dbb->dbb_ast_flags & DBB_get_shadows)
SDW_get_shadows(tdbb);
// Look for the page in the cache.
BufferDesc* bdb = get_buffer(tdbb, window->win_page,
((lock_type >= LCK_write) ? SYNC_EXCLUSIVE : SYNC_SHARED), wait);
if (wait != 1 && bdb == 0)
return lsLatchTimeout; // latch timeout
if (lock_type >= LCK_write)
bdb->bdb_flags |= BDB_writer;
window->win_bdb = bdb;
window->win_buffer = bdb->bdb_buffer;
if (bcb->bcb_flags & BCB_exclusive)
return (bdb->bdb_flags & BDB_read_pending) ? lsLocked : lsLockedHavePage;
// lock_buffer returns lsLocked, lsLockedHavePage or lsLockTimeout.
const LockState lock_result = lock_buffer(tdbb, bdb, wait, page_type);
return lock_result;
}
void CCH_fetch_page(thread_db* tdbb, WIN* window, const bool read_shadow)
{
/**************************************
*
* C C H _ f e t c h _ p a g e
*
**************************************
*
* Functional description
* Read a specific page into its assigned cache buffer, taking it
* from the difference file when incremental backup is active. The
* read_shadow flag controls whether an I/O error triggers rollover
* to a shadow file.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
BufferDesc* bdb = window->win_bdb;
BufferControl* bcb = bdb->bdb_bcb;
FbStatusVector* const status = tdbb->tdbb_status_vector;
pag* page = bdb->bdb_buffer;
bdb->bdb_incarnation = ++bcb->bcb_page_incarnation;
tdbb->bumpStats(RuntimeStatistics::PAGE_READS);
PageSpace* pageSpace = dbb->dbb_page_manager.findPageSpace(bdb->bdb_page.getPageSpaceID());
fb_assert(pageSpace);
jrd_file* file = pageSpace->file;
const bool isTempPage = pageSpace->isTemporary();
/*
We will read a page, and if there is an I/O error we will try to
use the shadow file, and try reading again, for a maximum of
three tries, before giving up.
The read_shadow flag is set to false only in the call to
FETCH_NO_SHADOW, which is only called from validate
code.
read_shadow = false -> If an I/O error occurs, give up (exit
the loop, clean up, and return) so the caller,
validate in most cases, can know about it and attempt
to remedy the situation.
read_shadow = true -> If an I/O error occurs, attempt
to roll over to the shadow file. If the I/O error is
persistent (more than 3 times), error out of the routine by
calling CCH_unwind, and eventually punting out.
*/
class Pio : public CryptoManager::IOCallback
{
public:
Pio(jrd_file* f, BufferDesc* b, bool tp, bool rs, PageSpace* ps)
: file(f), bdb(b), isTempPage(tp),
read_shadow(rs), pageSpace(ps)
{ }
bool callback(thread_db* tdbb, FbStatusVector* status, Ods::pag* page)
{
Database *dbb = tdbb->getDatabase();
int retryCount = 0;
while (!PIO_read(tdbb, file, bdb, page, status))
{
if (isTempPage || !read_shadow)
return false;
if (!CCH_rollover_to_shadow(tdbb, dbb, file, false))
return false;
if (file != pageSpace->file)
file = pageSpace->file;
else
{
if (retryCount++ == 3)
{
gds__log("IO error loop Unwind to avoid a hang\n");
return false;
}
}
}
return true;
}
private:
jrd_file* file;
BufferDesc* bdb;
bool isTempPage;
bool read_shadow;
PageSpace* pageSpace;
};
BackupManager* bm = dbb->dbb_backup_manager;
BackupManager::StateReadGuard stateGuard(tdbb);
const int bak_state = bm->getState();
fb_assert(bak_state != Ods::hdr_nbak_unknown);
ULONG diff_page = 0;
if (!isTempPage && bak_state != Ods::hdr_nbak_normal)
{
diff_page = bm->getPageIndex(tdbb, bdb->bdb_page.getPageNum());
NBAK_TRACE(("Reading page %d:%06d, state=%d, diff page=%d",
bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum(), bak_state, diff_page));
}
// In merge mode, if we are reading beyond the old end of file and the page
// is in the .delta file, then the actual page is maintained in the difference
// file. Always read it from there.
if (isTempPage || bak_state == Ods::hdr_nbak_normal || !diff_page)
{
NBAK_TRACE(("Reading page %d:%06d, state=%d, diff page=%d from DISK",
bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum(), bak_state, diff_page));
// Read page from disk as normal
Pio io(file, bdb, isTempPage, read_shadow, pageSpace);
if (!dbb->dbb_crypto_manager->read(tdbb, status, page, &io))
{
if (read_shadow && !isTempPage)
{
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
CCH_unwind(tdbb, true);
}
}
}
else
{
NBAK_TRACE(("Reading page %d, state=%d, diff page=%d from DIFFERENCE",
bdb->bdb_page, bak_state, diff_page));
if (!bm->readDifference(tdbb, diff_page, page))
{
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
CCH_unwind(tdbb, true);
}
if (page->pag_type == 0 && page->pag_generation == 0 && page->pag_scn == 0)
{
// We encountered a page which was allocated, but never written to the
// difference file. In this case we try to read the page from the database.
// With this approach, if the page is old we get it from DISK, and if the
// page is new an I/O error (EOF) or a BUGCHECK (checksum error) will be
// the result. The engine is not supposed to read a page which was never
// written, unless this is a merge process.
NBAK_TRACE(("Re-reading page %d, state=%d, diff page=%d from DISK",
bdb->bdb_page, bak_state, diff_page));
Pio io(file, bdb, false, read_shadow, pageSpace);
if (!dbb->dbb_crypto_manager->read(tdbb, status, page, &io))
{
if (read_shadow)
{
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
CCH_unwind(tdbb, true);
}
}
}
}
bdb->bdb_flags &= ~(BDB_not_valid | BDB_read_pending);
window->win_buffer = bdb->bdb_buffer;
}
void CCH_forget_page(thread_db* tdbb, WIN* window)
{
/**************************************
*
* C C H _ f o r g e t _ p a g e
*
**************************************
*
* Functional description
* Page was faked but can't be written to disk, most probably
* because the disk is out of space. Release the page buffer and
* other resources and unlink the page from various queues.
*
**************************************/
SET_TDBB(tdbb);
BufferDesc* bdb = window->win_bdb;
Database* dbb = tdbb->getDatabase();
if (window->win_page != bdb->bdb_page || bdb->bdb_buffer->pag_type != pag_undefined)
return; // buffer was reassigned or page was reused
window->win_bdb = NULL;
if (bdb->bdb_flags & BDB_io_error)
dbb->dbb_flags &= ~DBB_suspend_bgio;
clear_dirty_flag_and_nbak_state(tdbb, bdb);
bdb->bdb_flags = 0;
BufferControl* bcb = dbb->dbb_bcb;
removeDirty(bcb, bdb);
QUE_DELETE(bdb->bdb_in_use);
QUE_DELETE(bdb->bdb_que);
QUE_INSERT(bcb->bcb_empty, bdb->bdb_que);
if (tdbb->tdbb_flags & TDBB_no_cache_unwind)
bdb->release(tdbb, true);
}
void CCH_fini(thread_db* tdbb)
{
/**************************************
*
* C C H _ f i n i
*
**************************************
*
* Functional description
* Shut down buffer operation.
*
**************************************/
SET_TDBB(tdbb);
Database* const dbb = tdbb->getDatabase();
BufferControl* const bcb = dbb->dbb_bcb;
if (!bcb)
return;
bcb_repeat* tail = bcb->bcb_rpt;
const bcb_repeat* const end = tail + bcb->bcb_count;
for (; tail < end; tail++)
{
if (tail->bcb_bdb)
{
delete tail->bcb_bdb;
tail->bcb_bdb = NULL;
}
}
delete[] bcb->bcb_rpt;
bcb->bcb_rpt = NULL;
bcb->bcb_count = 0;
while (bcb->bcb_memory.hasData())
bcb->bcb_bufferpool->deallocate(bcb->bcb_memory.pop());
BufferControl::destroy(bcb);
dbb->dbb_bcb = NULL;
}
void CCH_flush(thread_db* tdbb, USHORT flush_flag, TraNumber tra_number)
{
/**************************************
*
* C C H _ f l u s h
*
**************************************
*
* Functional description
* Flush all buffers. If the release flag is set,
* release all locks.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
// note that some of the code for btc_flush()
// replicates code in the for loop
// to minimize call overhead -- changes should be made in both places
if (flush_flag & (FLUSH_TRAN | FLUSH_SYSTEM))
{
const ULONG transaction_mask = tra_number ? 1L << (tra_number & (BITS_PER_LONG - 1)) : 0;
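// Worked example (illustrative): transaction numbers are hashed onto the
// bits of one mask word, so with BITS_PER_LONG == 32, transactions 5, 37
// and 69 all map to bit 5. The mask is therefore conservative: flushing
// for one transaction may also flush pages dirtied by others sharing its
// bucket, which is safe, merely extra I/O.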
bool sys_only = false;
if (!transaction_mask && (flush_flag & FLUSH_SYSTEM))
sys_only = true;
#ifdef SUPERSERVER_V2
BufferControl* bcb = dbb->dbb_bcb;
//if (!dbb->dbb_wal && A && B) becomes
//if (true && A && B) then finally (A && B)
if (!(dbb->dbb_flags & DBB_force_write) && transaction_mask)
{
dbb->dbb_flush_cycle |= transaction_mask;
if (!(bcb->bcb_flags & BCB_writer_active))
bcb->bcb_writer_sem.release();
}
else
#endif
flushDirty(tdbb, transaction_mask, sys_only);
}
else
flushAll(tdbb, flush_flag);
//
// Check if flush needed
//
const int max_unflushed_writes = dbb->dbb_config->getMaxUnflushedWrites();
const time_t max_unflushed_write_time = dbb->dbb_config->getMaxUnflushedWriteTime();
bool max_num = (max_unflushed_writes >= 0);
bool max_time = (max_unflushed_write_time >= 0);
bool doFlush = false;
PageSpace* pageSpaceID = dbb->dbb_page_manager.findPageSpace(DB_PAGE_SPACE);
jrd_file* main_file = pageSpaceID->file;
// Avoid flush while creating and restoring database
const Jrd::Attachment* att = tdbb->getAttachment();
const bool dontFlush = (dbb->dbb_flags & DBB_creating) ||
(dbb->dbb_ast_flags & DBB_shutdown_single) &&
att && (att->att_flags & (ATT_creator | ATT_system));
if (!(main_file->fil_flags & FIL_force_write) && (max_num || max_time) && !dontFlush)
{
const time_t now = time(0);
SyncLockGuard guard(&dbb->dbb_flush_count_mutex, SYNC_EXCLUSIVE, "CCH_flush");
// If this is the first commit set last_flushed_write to now
if (!dbb->last_flushed_write)
dbb->last_flushed_write = now;
const bool forceFlush = (flush_flag & FLUSH_ALL);
// test max_num condition and max_time condition
max_num = max_num && (dbb->unflushed_writes == max_unflushed_writes);
max_time = max_time && (now - dbb->last_flushed_write > max_unflushed_write_time);
if (forceFlush || max_num || max_time)
{
doFlush = true;
dbb->unflushed_writes = 0;
dbb->last_flushed_write = now;
}
else
{
dbb->unflushed_writes++;
}
}
if (doFlush)
{
PIO_flush(tdbb, main_file);
for (Shadow* shadow = dbb->dbb_shadow; shadow; shadow = shadow->sdw_next)
PIO_flush(tdbb, shadow->sdw_file);
BackupManager* bm = dbb->dbb_backup_manager;
if (!bm->isShutDown())
{
BackupManager::StateReadGuard stateGuard(tdbb);
const int backup_state = bm->getState();
if (backup_state == Ods::hdr_nbak_stalled || backup_state == Ods::hdr_nbak_merge)
bm->flushDifference(tdbb);
}
}
// take the opportunity when we know there are no pages
// in cache to check that the shadow(s) have not been
// scheduled for shutdown or deletion
SDW_check(tdbb);
}
void CCH_flush_ast(thread_db* tdbb)
{
/**************************************
*
* C C H _ f l u s h _ a s t
*
**************************************
*
* Functional description
* Flush all buffers coming from database file.
* Should be called from AST
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
if (bcb->bcb_flags & BCB_exclusive)
CCH_flush(tdbb, FLUSH_ALL, 0);
else
{
// Do some fancy footwork to make sure that pages are
// not removed from the btc tree at AST level. Then
// restore the flag to whatever it was before.
const bool keep_pages = bcb->bcb_flags & BCB_keep_pages;
bcb->bcb_flags |= BCB_keep_pages;
for (ULONG i = 0; (bcb = dbb->dbb_bcb) && i < bcb->bcb_count; i++)
{
BufferDesc* bdb = bcb->bcb_rpt[i].bcb_bdb;
if (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty))
down_grade(tdbb, bdb, 1);
}
if (!keep_pages)
bcb->bcb_flags &= ~BCB_keep_pages;
}
}
bool CCH_free_page(thread_db* tdbb)
{
/**************************************
*
* C C H _ f r e e _ p a g e
*
**************************************
*
* Functional description
* Check if the cache is below its free pages
* threshold and write a page on the LRU tail.
*
**************************************/
// Called by VIO/garbage_collector() when it is idle to
// help quench the thirst for free pages.
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
if (dbb->readOnly())
return false;
BufferDesc* bdb;
if ((bcb->bcb_flags & BCB_free_pending) &&
(bdb = get_buffer(tdbb, FREE_PAGE, SYNC_NONE, 1)))
{
if (write_buffer(tdbb, bdb, bdb->bdb_page, true, tdbb->tdbb_status_vector, true))
return true;
CCH_unwind(tdbb, false);
}
return false;
}
SLONG CCH_get_incarnation(WIN* window)
{
/**************************************
*
* C C H _ g e t _ i n c a r n a t i o n
*
**************************************
*
* Functional description
* Get page incarnation associated with buffer.
*
**************************************/
return window->win_bdb->bdb_incarnation;
}
void CCH_get_related(thread_db* tdbb, PageNumber page, PagesArray &lowPages)
{
/**************************************
*
* C C H _ g e t _ r e l a t e d
*
**************************************
*
* Functional description
* Collect all pages dependent on the given page (i.e. all pages which must
* be written after the given page). To do it, walk the low part of the
* precedence graph starting from the given page and put their numbers into
* the array.
*
**************************************/
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
Sync bcbSync(&bcb->bcb_syncObject, "CCH_get_related");
bcbSync.lock(SYNC_SHARED);
BufferDesc* bdb = find_buffer(bcb, page, false);
bcbSync.unlock();
if (bdb)
{
Sync precSync(&bcb->bcb_syncPrecedence, "CCH_get_related");
precSync.lock(SYNC_EXCLUSIVE);
const ULONG mark = get_prec_walk_mark(bcb);
get_related(bdb, lowPages, PRE_SEARCH_LIMIT, mark);
}
}
pag* CCH_handoff(thread_db* tdbb, WIN* window, ULONG page, int lock, SCHAR page_type,
int wait, const bool release_tail)
{
/**************************************
*
* C C H _ h a n d o f f
*
**************************************
*
* Functional description
* Follow a pointer handing off the lock. Fetch the new page
* before retiring the old page lock.
*
* input
* wait: 1 => Wait as long as necessary to get the latch.
* This can cause deadlocks of course.
* 0 => If the latch can't be acquired immediately,
* give up and return 0.
* <negative number> => Latch timeout interval in seconds.
*
* return
* PAG if successful.
* 0 if a latch timeout occurred (only possible if wait <> 1).
* The latch on the fetched page is downgraded to shared.
* The fetched page is unmarked.
*
**************************************/
// The update, if there was one, of the input page is complete.
// The cache buffer can be 'unmarked'. It is important to
// unmark before CCH_unwind is (might be) called.
SET_TDBB(tdbb);
CCH_TRACE(("HANDOFF %d:%06d->%06d, %s",
window->win_page.getPageSpaceID(), window->win_page.getPageNum(), page, (lock >= LCK_write) ? "EX" : "SH"));
BufferDesc *bdb = window->win_bdb;
// unmark
if (bdb->bdb_writers == 1 && (bdb->bdb_flags & BDB_marked))
{
bdb->bdb_flags &= ~BDB_marked;
bdb->unLockIO(tdbb);
}
// If the 'from-page' and 'to-page' of the handoff are the
// same and the latch requested is shared then downgrade it.
if ((window->win_page.getPageNum() == page) && (lock == LCK_read))
{
if (bdb->ourExclusiveLock())
bdb->downgrade(SYNC_SHARED);
return window->win_buffer;
}
WIN temp = *window;
window->win_page = PageNumber(window->win_page.getPageSpaceID(), page);
// This prevents a deadlock with the precedence queue, as shown by
// mwrite mwrite1 2 mwrite2 2 test.fdb
const int wait2 = bdb->ourExclusiveLock() ? LCK_NO_WAIT : wait;
LockState must_read = CCH_fetch_lock(tdbb, window, lock, wait2, page_type);
if ((must_read == lsLatchTimeout || must_read == lsLockTimeout) && wait2 == LCK_NO_WAIT)
{
temp.win_bdb->downgrade(SYNC_SHARED);
must_read = CCH_fetch_lock(tdbb, window, lock, wait, page_type);
}
// Latch or lock timeout, return failure.
if (must_read == lsLatchTimeout || must_read == lsLockTimeout)
{
*window = temp;
CCH_RELEASE(tdbb, window);
return NULL;
}
if (release_tail)
CCH_RELEASE_TAIL(tdbb, &temp);
else
CCH_RELEASE(tdbb, &temp);
if (must_read != lsLockedHavePage)
CCH_fetch_page(tdbb, window, true);
bdb = window->win_bdb;
if (lock != LCK_write && must_read != lsLockedHavePage)
{
if (bdb->ourExclusiveLock())
bdb->downgrade(SYNC_SHARED);
}
adjust_scan_count(window, must_read == lsLocked);
// Validate the fetched page matches the expected type
if (bdb->bdb_buffer->pag_type != page_type && page_type != pag_undefined)
page_validation_error(tdbb, window, page_type);
return window->win_buffer;
}
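// Traversal sketch (illustrative; 'next_page_number' is assumed to be read
// from the current page): walking a page chain hands the latch off link by
// link, so at most two pages are latched at any moment.
//
//	pag* page = CCH_FETCH(tdbb, &window, LCK_read, pag_data);
//	while (next_page_number)
//		page = CCH_HANDOFF(tdbb, &window, next_page_number, LCK_read, pag_data);
//	CCH_RELEASE(tdbb, &window);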
void CCH_init(thread_db* tdbb, ULONG number)
{
/**************************************
*
* C C H _ i n i t
*
**************************************
*
* Functional description
* Initialize the cache. Allocate buffers control block,
* buffer descriptors, and actual buffers.
*
**************************************/
SET_TDBB(tdbb);
Database* const dbb = tdbb->getDatabase();
const bool shared = (dbb->dbb_flags & DBB_shared);
CCH_TRACE(("INIT %s", dbb->dbb_filename.c_str()));
// Check for database-specific page buffers
if (dbb->dbb_page_buffers)
number = dbb->dbb_page_buffers;
// Enforce page buffer cache constraints
if (number < MIN_PAGE_BUFFERS)
number = MIN_PAGE_BUFFERS;
if (number > MAX_PAGE_BUFFERS)
number = MAX_PAGE_BUFFERS;
const SLONG count = number;
// Allocate and initialize buffers control block
BufferControl* bcb = BufferControl::create(dbb);
while (true)
{
try
{
bcb->bcb_rpt = FB_NEW_POOL(*bcb->bcb_bufferpool) bcb_repeat[number];
break;
}
catch (const Firebird::Exception& ex)
{
ex.stuffException(tdbb->tdbb_status_vector);
// If the buffer control block can't be allocated, memory is
// very low. Recalculate the number of buffers to account for
// page buffer overhead and reduce that number by a 25% fudge factor.
number = (sizeof(bcb_repeat) * number) / PAGE_OVERHEAD;
number -= number >> 2;
if (number < MIN_PAGE_BUFFERS)
ERR_post(Arg::Gds(isc_cache_too_small));
}
}
dbb->dbb_bcb = bcb;
bcb->bcb_page_size = dbb->dbb_page_size;
bcb->bcb_database = dbb;
bcb->bcb_flags = shared ? BCB_exclusive : 0;
//bcb->bcb_flags = BCB_exclusive; // TODO detect real state using LM
QUE_INIT(bcb->bcb_in_use);
QUE_INIT(bcb->bcb_dirty);
bcb->bcb_dirty_count = 0;
QUE_INIT(bcb->bcb_empty);
// initialization of memory is system-specific
bcb->bcb_count = memory_init(tdbb, bcb, static_cast<SLONG>(number));
bcb->bcb_free_minimum = (SSHORT) MIN(bcb->bcb_count / 4, 128);
if (bcb->bcb_count < MIN_PAGE_BUFFERS)
ERR_post(Arg::Gds(isc_cache_too_small));
// Log if requested number of page buffers could not be allocated.
if (count != (SLONG) bcb->bcb_count)
{
gds__log("Database: %s\n\tAllocated %ld page buffers of %ld requested",
tdbb->getAttachment()->att_filename.c_str(), bcb->bcb_count, count);
}
if (dbb->dbb_lock->lck_logical != LCK_EX)
dbb->dbb_ast_flags |= DBB_assert_locks;
}
void CCH_init2(thread_db* tdbb)
{
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
if (!(bcb->bcb_flags & BCB_exclusive) || (bcb->bcb_flags & (BCB_cache_writer | BCB_writer_start)))
return;
#ifdef CACHE_READER
if (gds__thread_start(cache_reader, dbb, THREAD_high, 0, 0))
{
ERR_bugcheck_msg("cannot start thread");
}
{ // scope
Database::Checkout dcoHolder(dbb, FB_FUNCTION);
dbb->dbb_reader_init.enter();
}
#endif
const Attachment* att = tdbb->getAttachment();
if (!(dbb->dbb_flags & DBB_read_only) && !(att->att_flags & ATT_security_db))
{
// writer startup in progress
bcb->bcb_flags |= BCB_writer_start;
try
{
bcb->bcb_writer_fini.run(bcb);
}
catch (const Exception&)
{
bcb->bcb_flags &= ~BCB_writer_start;
ERR_bugcheck_msg("cannot start cache writer thread");
}
bcb->bcb_writer_init.enter();
}
}
void CCH_mark(thread_db* tdbb, WIN* window, bool mark_system, bool must_write)
{
/**************************************
*
* C C H _ m a r k
*
**************************************
*
* Functional description
* Mark a window as dirty.
*
**************************************/
BufferDesc* bdb = window->win_bdb;
BLKCHK(bdb, type_bdb);
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
tdbb->bumpStats(RuntimeStatistics::PAGE_MARKS);
BufferControl* bcb = dbb->dbb_bcb;
if (!(bdb->bdb_flags & BDB_writer))
BUGCHECK(208); // msg 208 page not accessed for write
CCH_TRACE(("MARK %d:%06d", window->win_page.getPageSpaceID(), window->win_page.getPageNum()));
// A LATCH_mark is needed before the BufferDesc can be marked.
// This prevents a write while the page is being modified.
if (!(bdb->bdb_flags & BDB_marked))
bdb->lockIO(tdbb);
fb_assert(bdb->ourIOLock());
// Allocate the difference page (if in stalled mode) before marking the page
// as dirty. This guarantees that disk space is allocated and the page can be
// written later.
if (!set_diff_page(tdbb, bdb))
{
clear_dirty_flag_and_nbak_state(tdbb, bdb);
bdb->unLockIO(tdbb);
CCH_unwind(tdbb, true);
}
fb_assert(dbb->dbb_backup_manager->getState() != Ods::hdr_nbak_unknown);
bdb->bdb_incarnation = ++bcb->bcb_page_incarnation;
// mark the dirty bit vector for this specific transaction,
// if it exists; otherwise mark that the system transaction
// has updated this page
int newFlags = 0;
TraNumber number;
jrd_tra* transaction = tdbb->getTransaction();
if (transaction && (number = transaction->tra_number))
{
if (!(tdbb->tdbb_flags & TDBB_sweeper))
{
const ULONG trans_bucket = number & (BITS_PER_LONG - 1);
bdb->bdb_transactions |= (1L << trans_bucket);
if (number > bdb->bdb_mark_transaction)
bdb->bdb_mark_transaction = number;
}
}
else
newFlags |= BDB_system_dirty;
if (mark_system)
newFlags |= BDB_system_dirty;
/*if (bcb->bcb_flags & BCB_exclusive) */
newFlags |= BDB_db_dirty;
if (must_write || dbb->dbb_backup_manager->databaseFlushInProgress())
newFlags |= BDB_must_write;
bdb->bdb_flags |= newFlags;
if (!(tdbb->tdbb_flags & TDBB_sweeper) || (bdb->bdb_flags & BDB_system_dirty))
insertDirty(bcb, bdb);
bdb->bdb_flags |= BDB_marked | BDB_dirty;
}
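// Typical update protocol (a sketch; error handling omitted): a page must be
// fetched for write and marked before modification, so that set_diff_page
// and the dirty-page bookkeeping above run first.
//
//	pag* page = CCH_FETCH(tdbb, &window, LCK_write, pag_data);
//	CCH_MARK(tdbb, &window);	// macro wrapper of CCH_mark, see cch_proto.h
//	... modify the page ...
//	CCH_RELEASE(tdbb, &window);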
void CCH_must_write(thread_db* tdbb, WIN* window)
{
/**************************************
*
* C C H _ m u s t _ w r i t e
*
**************************************
*
* Functional description
* Mark a window as "must write".
*
**************************************/
SET_TDBB(tdbb);
BufferDesc* bdb = window->win_bdb;
BLKCHK(bdb, type_bdb);
if (!(bdb->bdb_flags & BDB_marked) || !(bdb->bdb_flags & BDB_dirty)) {
BUGCHECK(208); // msg 208 page not accessed for write
}
bdb->bdb_flags |= BDB_must_write | BDB_dirty;
fb_assert((bdb->bdb_flags & BDB_nbak_state_lock) ||
PageSpace::isTemporary(bdb->bdb_page.getPageSpaceID()));
}
void CCH_precedence(thread_db* tdbb, WIN* window, ULONG pageNum)
{
const USHORT pageSpaceID = pageNum > FIRST_PIP_PAGE ?
window->win_page.getPageSpaceID() : DB_PAGE_SPACE;
CCH_precedence(tdbb, window, PageNumber(pageSpaceID, pageNum));
}
void CCH_tra_precedence(thread_db* tdbb, WIN* window, TraNumber traNum)
{
/*
if (traNum <= tdbb->getDatabase()->dbb_last_header_write)
{
return;
}
*/
check_precedence(tdbb, window, PageNumber(TRANS_PAGE_SPACE, traNum));
}
void CCH_precedence(thread_db* tdbb, WIN* window, PageNumber page)
{
/**************************************
*
* C C H _ p r e c e d e n c e
*
**************************************
*
* Functional description
* Given a window accessed for write and a page number,
* establish a precedence relationship such that the
* specified page will always be written before the page
* associated with the window.
*
* If the "page number" is negative, it is really a transaction
* id. In this case, the precedence relationship is to the
* database header page from which the transaction id was
* obtained. If the header page has been written since the
* transaction id was assigned, no precedence relationship
* is required.
*
**************************************/
// If the page is zero, the caller isn't really serious
if (page.getPageNum() == 0)
return;
// no need to support precedence for temporary pages
if (page.isTemporary() || window->win_page.isTemporary())
return;
check_precedence(tdbb, window, page);
}
#ifdef CACHE_READER
void CCH_prefetch(thread_db* tdbb, SLONG* pages, SSHORT count)
{
/**************************************
*
* C C H _ p r e f e t c h
*
**************************************
*
* Functional description
* Given a vector of pages, set corresponding bits
* in global prefetch bitmap. Initiate an asynchronous
* I/O and get the cache reader reading in our behalf
* as well.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
if (!count || !(bcb->bcb_flags & BCB_cache_reader))
{
// Caller isn't really serious.
return;
}
// Switch default pool to permanent pool for setting bits in prefetch bitmap.
Jrd::ContextPoolHolder context(tdbb, bcb->bcb_bufferpool);
// The global prefetch bitmap is the key to the I/O coalescence mechanism which dovetails
// all thread prefetch requests to minimize sequential I/O requests.
// It also enables multipage I/O by implicitly sorting page vector requests.
SLONG first_page = 0;
for (const SLONG* const end = pages + count; pages < end;)
{
const SLONG page = *pages++;
if (page)
{
SBM_set(tdbb, &bcb->bcb_prefetch, page);
if (!first_page)
first_page = page;
}
}
// Not likely that the caller's page vector was empty but check anyway.
if (first_page)
{
prf prefetch;
prefetch_init(&prefetch, tdbb);
prefetch_prologue(&prefetch, &first_page);
prefetch_io(&prefetch, tdbb->tdbb_status_vector);
prefetch_epilogue(&prefetch, tdbb->tdbb_status_vector);
}
}
bool CCH_prefetch_pages(thread_db* tdbb)
{
/**************************************
*
* C C H _ p r e f e t c h _ p a g e s
*
**************************************
*
* Functional description
* Check the prefetch bitmap for a set
* of pages and read them into the cache.
*
**************************************/
// Placeholder to be implemented when predictive prefetch is
// enabled. This is called by VIO/garbage_collector() when it
// is idle to help satisfy the prefetch demand.
return false;
}
#endif // CACHE_READER
bool set_diff_page(thread_db* tdbb, BufferDesc* bdb)
{
Database* const dbb = tdbb->getDatabase();
BackupManager* const bm = dbb->dbb_backup_manager;
// Temporary pages don't write to delta and need no SCN
PageSpace* pageSpace = dbb->dbb_page_manager.findPageSpace(bdb->bdb_page.getPageSpaceID());
fb_assert(pageSpace);
if (pageSpace->isTemporary())
return true;
// Take backup state lock
if (!(tdbb->tdbb_flags & TDBB_backup_write_locked))
{
const AtomicCounter::counter_type oldFlags = bdb->bdb_flags.exchangeBitOr(BDB_nbak_state_lock);
if (!(oldFlags & BDB_nbak_state_lock))
{
NBAK_TRACE(("lock state for dirty page %d:%06d",
bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum()));
bm->lockStateRead(tdbb, LCK_WAIT);
}
}
else
fb_assert(bdb->bdb_page == HEADER_PAGE_NUMBER);
if (bdb->bdb_page != HEADER_PAGE_NUMBER)
{
// SCN of header page is adjusted in nbak.cpp
if (bdb->bdb_buffer->pag_scn != bm->getCurrentSCN())
{
bdb->bdb_buffer->pag_scn = bm->getCurrentSCN(); // Set SCN for the page
// At PAG_set_page_scn() below we could dirty the SCN page and thus acquire
// the nbackup state lock recursively. Since RWLock allows that, we are safe.
// Otherwise we would have to release the just-taken state lock before the
// call to PAG_set_page_scn() and acquire it again; and if the current SCN
// changed meanwhile, we would have to repeat the whole process again...
win window(bdb->bdb_page);
window.win_bdb = bdb;
window.win_buffer = bdb->bdb_buffer;
PAG_set_page_scn(tdbb, &window);
}
fb_assert(bdb->bdb_buffer->pag_scn == bm->getCurrentSCN());
}
// Determine location of the page in difference file and write destination
// so BufferDesc AST handlers and write_page routine can safely use this information
const int backup_state = bm->getState();
if (backup_state == Ods::hdr_nbak_normal)
return true;
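// Summary of the cases below: in hdr_nbak_stalled every changed page must
// have a difference page, allocating one if necessary; in hdr_nbak_merge an
// existing difference page is reused so the page can be written to both
// places while the delta is merged back (hdr_nbak_normal returned above).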
switch (backup_state)
{
case Ods::hdr_nbak_stalled:
bdb->bdb_difference_page = bm->getPageIndex(tdbb, bdb->bdb_page.getPageNum());
if (!bdb->bdb_difference_page)
{
bdb->bdb_difference_page = bm->allocateDifferencePage(tdbb, bdb->bdb_page.getPageNum());
if (!bdb->bdb_difference_page) {
return false;
}
NBAK_TRACE(("Allocate difference page %d for database page %d",
bdb->bdb_difference_page, bdb->bdb_page));
}
else
{
NBAK_TRACE(("Map existing difference page %d to database page %d",
bdb->bdb_difference_page, bdb->bdb_page));
}
break;
case Ods::hdr_nbak_merge:
bdb->bdb_difference_page = bm->getPageIndex(tdbb, bdb->bdb_page.getPageNum());
if (bdb->bdb_difference_page)
{
NBAK_TRACE(("Map existing difference page %d to database page %d (write_both)",
bdb->bdb_difference_page, bdb->bdb_page));
}
break;
}
return true;
}
void CCH_release(thread_db* tdbb, WIN* window, const bool release_tail)
{
/**************************************
*
* C C H _ r e l e a s e
*
**************************************
*
* Functional description
* Release a window. If the release_tail
* flag is true then make the buffer
* least-recently-used.
*
**************************************/
SET_TDBB(tdbb);
BufferDesc* const bdb = window->win_bdb;
BLKCHK(bdb, type_bdb);
BufferControl* bcb = bdb->bdb_bcb;
CCH_TRACE(("RELEASE %d:%06d", window->win_page.getPageSpaceID(), window->win_page.getPageNum()));
// A large sequential scan has requested that the garbage
// collector garbage collect. Mark the buffer so that the
// page isn't released to the LRU tail before the garbage
// collector can process the page.
if (window->win_flags & WIN_large_scan && window->win_flags & WIN_garbage_collect)
{
bdb->bdb_flags |= BDB_garbage_collect;
window->win_flags &= ~WIN_garbage_collect;
}
const bool mustWrite = (bdb->bdb_flags & BDB_must_write) ||
bcb->bcb_database->dbb_backup_manager->databaseFlushInProgress();
// if (bdb->bdb_writers == 1 || bdb->bdb_use_count == 1)
if (bdb->bdb_writers == 1 ||
(bdb->bdb_writers == 0 && mustWrite))
{
const bool marked = bdb->bdb_flags & BDB_marked;
bdb->bdb_flags &= ~(BDB_writer | BDB_marked | BDB_faked);
if (marked)
bdb->unLockIO(tdbb);
if (mustWrite)
{
// Downgrade exclusive latch to shared to allow concurrent share access
// to page during I/O.
bdb->downgrade(SYNC_SHARED);
if (!write_buffer(tdbb, bdb, bdb->bdb_page, false, tdbb->tdbb_status_vector, true))
{
insertDirty(bcb, bdb);
CCH_unwind(tdbb, true);
}
}
}
if (bdb->bdb_use_count == 1)
{
if (bdb->bdb_flags & BDB_no_blocking_ast)
{
if (bdb->bdb_flags & (BDB_db_dirty | BDB_dirty))
{
if (!write_buffer(tdbb, bdb, bdb->bdb_page, false, tdbb->tdbb_status_vector, true))
{
// Reassert blocking AST after write failure with dummy lock convert
// to same level. This will re-enable blocking AST notification.
{ // scope
ThreadStatusGuard temp_status(tdbb);
LCK_convert_opt(tdbb, bdb->bdb_lock, bdb->bdb_lock->lck_logical);
}
CCH_unwind(tdbb, true);
}
}
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
bdb->bdb_flags &= ~BDB_no_blocking_ast;
bdb->bdb_ast_flags &= ~BDB_blocking;
}
// Make buffer the least-recently-used by queueing it to the LRU tail
if (release_tail)
{
if ((window->win_flags & WIN_large_scan && bdb->bdb_scan_count > 0 &&
!(--bdb->bdb_scan_count) && !(bdb->bdb_flags & BDB_garbage_collect)) ||
(window->win_flags & WIN_garbage_collector && bdb->bdb_flags & BDB_garbage_collect &&
!bdb->bdb_scan_count))
{
if (window->win_flags & WIN_garbage_collector)
bdb->bdb_flags &= ~BDB_garbage_collect;
{ // bcb_syncLRU scope
Sync lruSync(&bcb->bcb_syncLRU, "CCH_release");
lruSync.lock(SYNC_EXCLUSIVE);
if (bdb->bdb_flags & BDB_lru_chained)
{
requeueRecentlyUsed(bcb);
}
QUE_DELETE(bdb->bdb_in_use);
QUE_APPEND(bcb->bcb_in_use, bdb->bdb_in_use);
}
if ((bcb->bcb_flags & BCB_cache_writer) &&
(bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) )
{
insertDirty(bcb, bdb);
bcb->bcb_flags |= BCB_free_pending;
if (!(bcb->bcb_flags & BCB_writer_active))
{
bcb->bcb_writer_sem.release();
}
}
}
}
}
bdb->release(tdbb, true);
window->win_bdb = NULL;
}
void CCH_release_exclusive(thread_db* tdbb)
{
/**************************************
*
* C C H _ r e l e a s e _ e x c l u s i v e
*
**************************************
*
* Functional description
* Release exclusive access to database.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
dbb->dbb_flags &= ~DBB_exclusive;
Jrd::Attachment* attachment = tdbb->getAttachment();
if (attachment)
attachment->att_flags &= ~ATT_exclusive;
if (dbb->dbb_ast_flags & DBB_blocking)
LCK_re_post(tdbb, dbb->dbb_lock);
}
bool CCH_rollover_to_shadow(thread_db* tdbb, Database* dbb, jrd_file* file, const bool inAst)
{
/**************************************
*
* C C H _ r o l l o v e r _ t o _ s h a d o w
*
**************************************
*
* Functional description
* An I/O error has been detected on the
* main database file. Roll over to use
* the shadow file.
*
**************************************/
SET_TDBB(tdbb);
// Is the shadow subsystem initialized yet?
if (!dbb->dbb_shadow_lock)
return false;
// hvlad: if there are no shadows, we can't roll over.
// This is a temporary solution to prevent 100% CPU load
// in write_page in case of PIO_write failure
if (!dbb->dbb_shadow)
return false;
// notify other processes immediately to ensure all read from the shadow
// file instead of the database file
return SDW_rollover_to_shadow(tdbb, file, inAst);
}
void CCH_shutdown(thread_db* tdbb)
{
/**************************************
*
* C C H _ s h u t d o w n
*
**************************************
*
* Functional description
* Shutdown database physical page locks.
*
**************************************/
Database* const dbb = tdbb->getDatabase();
BufferControl* const bcb = dbb->dbb_bcb;
if (!bcb)
return;
#ifdef CACHE_READER
// Shutdown the dedicated cache reader for this database
if (bcb->bcb_flags & BCB_cache_reader)
{
bcb->bcb_flags &= ~BCB_cache_reader;
dbb->dbb_reader_sem.release();
dbb->dbb_reader_fini.enter();
}
#endif
// Wait for cache writer startup to complete
while (bcb->bcb_flags & BCB_writer_start)
Thread::yield();
// Shutdown the dedicated cache writer for this database
if (bcb->bcb_flags & BCB_cache_writer)
{
bcb->bcb_flags &= ~BCB_cache_writer;
bcb->bcb_writer_sem.release(); // Wake up running thread
bcb->bcb_writer_fini.waitForCompletion();
}
SyncLockGuard bcbSync(&bcb->bcb_syncObject, SYNC_EXCLUSIVE, "CCH_shutdown");
// Flush and release page buffers
bcb_repeat* tail = bcb->bcb_rpt;
const bcb_repeat* const end = tail + bcb->bcb_count;
if (tail && tail->bcb_bdb)
{
try
{
if (dbb->dbb_flags & DBB_bugcheck)
LongJump::raise();
CCH_flush(tdbb, FLUSH_FINI, 0);
}
catch (const Exception&)
{
for (; tail < end; tail++)
{
BufferDesc* const bdb = tail->bcb_bdb;
if (dbb->dbb_flags & DBB_bugcheck)
{
bdb->bdb_flags &= ~BDB_db_dirty;
clear_dirty_flag_and_nbak_state(tdbb, bdb);
}
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
}
}
}
// close the database file and all associated shadow files
dbb->dbb_page_manager.closeAll();
SDW_close();
}
void CCH_unwind(thread_db* tdbb, const bool punt)
{
/**************************************
*
* C C H _ u n w i n d
*
**************************************
*
* Functional description
* Synchronously unwind cache after I/O or lock error.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
// CCH_unwind is called when any of the following occurs:
// - IO error
// - journaling error => obsolete
// - bad page checksum => obsolete
// - wrong page type
// - page locking (not latching) deadlock
BufferControl* bcb = dbb->dbb_bcb;
if (!bcb || (tdbb->tdbb_flags & TDBB_no_cache_unwind))
{
if (punt)
ERR_punt();
return;
}
// A cache error has occurred. Scan the cache for buffers
// which may be in use and release them.
for (FB_SIZE_T n = 0; n < tdbb->tdbb_bdbs.getCount(); ++n)
{
BufferDesc *bdb = tdbb->tdbb_bdbs[n];
if (bdb)
{
if (bdb->bdb_flags & BDB_marked)
BUGCHECK(268); // msg 268 buffer marked during cache unwind
if (bdb->ourIOLock())
{
bdb->unLockIO(tdbb);
}
else
{
if (bdb->ourExclusiveLock())
bdb->bdb_flags &= ~(BDB_writer | BDB_faked | BDB_must_write);
bdb->release(tdbb, true);
}
}
}
/***
bcb_repeat* tail = bcb->bcb_rpt;
for (const bcb_repeat* const end = tail + bcb->bcb_count; tail < end; tail++)
{
BufferDesc* bdb = tail->bcb_bdb;
if (!bdb->bdb_use_count) {
continue;
}
if (bdb->bdb_io == tdbb) {
release_bdb(tdbb, bdb, true, false, false);
}
if (bdb->bdb_exclusive == tdbb)
{
if (bdb->bdb_flags & BDB_marked) {
BUGCHECK(268); // msg 268 buffer marked during cache unwind
}
bdb->bdb_flags &= ~(BDB_writer | BDB_faked | BDB_must_write);
release_bdb(tdbb, bdb, true, false, false);
}
// hvlad : as far as I understand thread can't hold more than two shared latches
// on the same bdb, so findSharedLatch below will not be called many times
SharedLatch* latch = findSharedLatch(tdbb, bdb);
while (latch)
{
release_bdb(tdbb, bdb, true, false, false);
latch = findSharedLatch(tdbb, bdb);
}
#ifndef SUPERSERVER
const pag* const page = bdb->bdb_buffer;
if (page->pag_type == pag_header || page->pag_type == pag_transactions)
{
++bdb->bdb_use_count;
clear_dirty_flag_and_nbak_state(tdbb, bdb);
bdb->bdb_flags &= ~(BDB_writer | BDB_marked | BDB_faked | BDB_db_dirty);
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
--bdb->bdb_use_count;
}
#endif
}
***/
tdbb->tdbb_flags |= TDBB_cache_unwound;
if (punt)
ERR_punt();
}
bool CCH_validate(WIN* window)
{
/**************************************
*
* C C H _ v a l i d a t e
*
**************************************
*
* Functional description
* Give a page a quick once-over looking for unhealthiness.
*
**************************************/
BufferDesc* bdb = window->win_bdb;
// If page is marked for write, checksum is questionable
if ((bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)))
return true;
return (bdb->bdb_buffer->pag_pageno == bdb->bdb_page.getPageNum());
}
bool CCH_write_all_shadows(thread_db* tdbb, Shadow* shadow, BufferDesc* bdb, Ods::pag* page,
FbStatusVector* status, const bool inAst)
{
/**************************************
*
* C C H _ w r i t e _ a l l _ s h a d o w s
*
**************************************
*
* Functional description
* Write a page out to all shadows, detecting failure on write.
* If shadow is null, write to all shadows, otherwise only to the
* specified shadow.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
Shadow* sdw = shadow ? shadow : dbb->dbb_shadow;
if (!sdw)
return true;
bool result = true;
Firebird::UCharBuffer spare_buffer;
if (bdb->bdb_page == HEADER_PAGE_NUMBER)
{
Ods::pag* newPage = (pag*) spare_buffer.getBuffer(dbb->dbb_page_size);
memcpy(newPage, page, HDR_SIZE);
page = newPage;
memset((UCHAR*) page + HDR_SIZE, 0, dbb->dbb_page_size - HDR_SIZE);
}
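// Stamp the page with its own number; readers validate this field against
// the expected page number (see CCH_validate).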
page->pag_pageno = bdb->bdb_page.getPageNum();
for (; sdw; sdw = sdw->sdw_next)
{
// don't bother to write to the shadow if it is no longer viable
/* Fix for bug 7925. drop_gdb fails to remove secondary file if
the shadow is conditional. Reason being the header page not
being correctly initialized.
The following block was not being performed for a conditional
shadow since SDW_INVALID expanded to include conditional shadow
-Sudesh 07/10/95
old code --> if (sdw->sdw_flags & SDW_INVALID)
*/
if ((sdw->sdw_flags & SDW_INVALID) && !(sdw->sdw_flags & SDW_conditional))
continue;
if (bdb->bdb_page == HEADER_PAGE_NUMBER)
{
// fixup header for shadow file
jrd_file* shadow_file = sdw->sdw_file;
header_page* header = (header_page*) page;
PageSpace* pageSpaceID = dbb->dbb_page_manager.findPageSpace(DB_PAGE_SPACE);
const UCHAR* q = (UCHAR *) pageSpaceID->file->fil_string;
header->hdr_data[0] = HDR_end;
header->hdr_end = HDR_SIZE;
header->hdr_next_page = 0;
PAG_add_header_entry(tdbb, header, HDR_root_file_name,
(USHORT) strlen((const char*) q), q);
jrd_file* next_file = shadow_file->fil_next;
if (next_file)
{
q = (UCHAR *) next_file->fil_string;
const SLONG last = next_file->fil_min_page - 1;
PAG_add_header_entry(tdbb, header, HDR_file, (USHORT) strlen((const char*) q), q);
PAG_add_header_entry(tdbb, header, HDR_last_page, sizeof(last), (const UCHAR*) &last);
}
header->hdr_flags |= hdr_active_shadow;
header->hdr_header.pag_pageno = bdb->bdb_page.getPageNum();
}
// This condition makes sure that PIO_write is performed in case of
// conditional shadow only if the page is Header page
//
// -Sudesh 07/10/95
if ((sdw->sdw_flags & SDW_conditional) && bdb->bdb_page != HEADER_PAGE_NUMBER)
continue;
// if a write failure happens on an AUTO shadow, mark the
// shadow to be deleted at the next available opportunity when we
// know we don't have a page fetched
if (!PIO_write(tdbb, sdw->sdw_file, bdb, page, status))
{
if (sdw->sdw_flags & SDW_manual)
result = false;
else
{
sdw->sdw_flags |= SDW_delete;
if (!inAst && SDW_check_conditional(tdbb))
{
if (SDW_lck_update(tdbb, 0))
{
SDW_notify(tdbb);
CCH_unwind(tdbb, false);
SDW_dump_pages(tdbb);
ERR_post(Arg::Gds(isc_deadlock));
}
}
}
}
// If shadow was specified, break out of loop
if (shadow) {
break;
}
}
return result;
}
static void adjust_scan_count(WIN* window, bool mustRead)
{
/**************************************
*
* a d j u s t _ s c a n _ c o u n t
*
**************************************/
BufferDesc* bdb = window->win_bdb;
// If a page was read or prefetched on behalf of a large scan
// then load the window scan count into the buffer descriptor.
// This buffer scan count is decremented by releasing a buffer
// with CCH_RELEASE_TAIL.
// Otherwise zero the buffer scan count to prevent the buffer
// from being queued to the LRU tail.
if (window->win_flags & WIN_large_scan)
{
if (mustRead || (bdb->bdb_flags & BDB_prefetch) || bdb->bdb_scan_count < 0)
bdb->bdb_scan_count = window->win_scans;
}
else if (window->win_flags & WIN_garbage_collector)
{
if (mustRead)
bdb->bdb_scan_count = -1;
if (bdb->bdb_flags & BDB_garbage_collect)
window->win_flags |= WIN_garbage_collect;
}
else if (window->win_flags & WIN_secondary)
{
if (mustRead)
bdb->bdb_scan_count = -1;
}
else
{
bdb->bdb_scan_count = 0;
if (bdb->bdb_flags & BDB_garbage_collect)
bdb->bdb_flags &= ~BDB_garbage_collect;
}
}
static BufferDesc* alloc_bdb(thread_db* tdbb, BufferControl* bcb, UCHAR** memory)
{
/**************************************
*
* a l l o c _ b d b
*
**************************************
*
* Functional description
* Allocate buffer descriptor block.
*
**************************************/
SET_TDBB(tdbb);
BufferDesc* bdb = FB_NEW_POOL(*bcb->bcb_bufferpool) BufferDesc(bcb);
try {
bdb->bdb_lock = alloc_page_lock(tdbb, bdb);
}
catch (const Firebird::Exception&)
{
delete bdb;
throw;
}
bdb->bdb_buffer = (pag*) *memory;
*memory += bcb->bcb_page_size;
QUE_INSERT(bcb->bcb_empty, bdb->bdb_que);
return bdb;
}
static Lock* alloc_page_lock(thread_db* tdbb, BufferDesc* bdb)
{
/**************************************
*
* a l l o c _ p a g e _ l o c k
*
**************************************
*
* Functional description
* Allocate a page-type lock.
*
**************************************/
SET_TDBB(tdbb);
Database* const dbb = tdbb->getDatabase();
BufferControl* const bcb = bdb->bdb_bcb;
const USHORT lockLen = PageNumber::getLockLen();
return FB_NEW_RPT(*bcb->bcb_bufferpool, lockLen)
Lock(tdbb, lockLen, LCK_bdb, bdb, blocking_ast_bdb);
}
static int blocking_ast_bdb(void* ast_object)
{
/**************************************
*
* b l o c k i n g _ a s t _ b d b
*
**************************************
*
* Functional description
* Blocking AST for buffer control blocks. This is called at
* AST (signal) level to indicate that a lock is blocking another
* process. If the BufferDesc* is in use, set a flag indicating that the
* lock is blocking and return. When the BufferDesc is released at normal
* level the lock will be down graded. If the BufferDesc* is not in use,
* write the page if dirty then downgrade lock. Things would be
* much hairier if UNIX supported asynchronous IO, but it doesn't.
* WHEW!
*
**************************************/
// CVC: I assume we need the function call but not the variable.
/*ThreadSync* const thread = */ThreadSync::getThread("blocking_ast_bdb");
BufferDesc* const bdb = static_cast<BufferDesc*>(ast_object);
try
{
BufferControl* const bcb = bdb->bdb_bcb;
fb_assert(!(bcb->bcb_flags & BCB_exclusive));
Database* const dbb = bcb->bcb_database;
fb_assert(dbb);
AsyncContextHolder tdbb(dbb, FB_FUNCTION);
// Do some fancy footwork to make sure that pages are
// not removed from the btc tree at AST level. Then
// restore the flag to whatever it was before.
const bool keep_pages = (bcb->bcb_flags & BCB_keep_pages) != 0;
bcb->bcb_flags |= BCB_keep_pages;
down_grade(tdbb, bdb);
if (!keep_pages)
bcb->bcb_flags &= ~BCB_keep_pages;
if (tdbb->tdbb_status_vector->getState() & IStatus::STATE_ERRORS)
iscDbLogStatus(dbb->dbb_filename.c_str(), tdbb->tdbb_status_vector);
}
catch (const Firebird::Exception&)
{
return -1;
} // no-op
return 0;
}
// Remove cleared precedence blocks from high precedence queue
static void purgePrecedence(BufferControl* bcb, BufferDesc* bdb)
{
Sync precSync(&bcb->bcb_syncPrecedence, "purgePrecedence");
precSync.lock(SYNC_EXCLUSIVE);
QUE que_prec = bdb->bdb_higher.que_forward, next_prec;
for (; que_prec != &bdb->bdb_higher; que_prec = next_prec)
{
next_prec = que_prec->que_forward;
Precedence* precedence = BLOCK(que_prec, Precedence, pre_higher);
if (precedence->pre_flags & PRE_cleared)
{
QUE_DELETE(precedence->pre_higher);
QUE_DELETE(precedence->pre_lower);
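// Return the block to the precedence free list, which is threaded through pre_hi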
precedence->pre_hi = (BufferDesc*) bcb->bcb_free;
bcb->bcb_free = precedence;
}
}
}
// Collect pages modified by the given or system transaction and write them to disk.
// See also comments in flushPages.
static void flushDirty(thread_db* tdbb, SLONG transaction_mask, const bool sys_only)
{
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
Firebird::HalfStaticArray<BufferDesc*, 1024> flush;
{ // dirtySync scope
Sync dirtySync(&bcb->bcb_syncDirtyBdbs, "flushDirty");
dirtySync.lock(SYNC_EXCLUSIVE);
QUE que_inst = bcb->bcb_dirty.que_forward, next;
for (; que_inst != &bcb->bcb_dirty; que_inst = next)
{
next = que_inst->que_forward;
BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_dirty);
if (!(bdb->bdb_flags & BDB_dirty))
{
removeDirty(bcb, bdb);
continue;
}
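// Flush the page if it was modified by one of the masked transactions,
// by the system transaction (BDB_system_dirty), by no tracked transaction
// at all, or when an unconditional flush (no mask, not system-only) is requested.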
if ((transaction_mask & bdb->bdb_transactions) ||
(bdb->bdb_flags & BDB_system_dirty) ||
(!transaction_mask && !sys_only) ||
(!bdb->bdb_transactions))
{
flush.add(bdb);
}
}
}
flushPages(tdbb, FLUSH_TRAN, flush.begin(), flush.getCount());
}
// Collect pages modified by the garbage collector or all dirty pages, or release
// page locks - depending on flush_flag - and write them to disk.
// See also comments in flushPages.
static void flushAll(thread_db* tdbb, USHORT flush_flag)
{
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
Firebird::HalfStaticArray<BufferDesc*, 1024> flush(bcb->bcb_dirty_count);
const bool all_flag = (flush_flag & FLUSH_ALL) != 0;
const bool sweep_flag = (flush_flag & FLUSH_SWEEP) != 0;
const bool release_flag = (flush_flag & FLUSH_RLSE) != 0;
const bool write_thru = release_flag;
for (ULONG i = 0; i < bcb->bcb_count; i++)
{
BufferDesc* bdb = bcb->bcb_rpt[i].bcb_bdb;
if (bdb->bdb_flags & (BDB_db_dirty | BDB_dirty))
{
if (bdb->bdb_flags & BDB_dirty)
flush.add(bdb);
else if (bdb->bdb_flags & BDB_db_dirty)
{
// pages modified by sweep/garbage collector are not in the dirty list
const bool dirty_list = (bdb->bdb_dirty.que_forward != &bdb->bdb_dirty);
if (all_flag || (sweep_flag && !dirty_list))
flush.add(bdb);
}
}
else if (release_flag)
{
bdb->addRef(tdbb, SYNC_EXCLUSIVE);
if (bdb->bdb_use_count > 1)
BUGCHECK(210); // msg 210 page in use during flush
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
bdb->release(tdbb, false);
}
}
flushPages(tdbb, flush_flag, flush.begin(), flush.getCount());
}
// Used in qsort below
extern "C" {
static int cmpBdbs(const void* a, const void* b)
{
const BufferDesc* bdbA = *(BufferDesc**)a;
const BufferDesc* bdbB = *(BufferDesc**)b;
if (bdbA->bdb_page > bdbB->bdb_page)
return 1;
if (bdbA->bdb_page < bdbB->bdb_page)
return -1;
return 0;
}
} // extern C
// Write array of pages to disk in efficient order.
// First, sort pages by number to make the writes physically ordered and
// thus faster. At every iteration of the while loop, write those pages which
// have no higher precedence pages, to preserve the required ordering. If after
// some iteration no such pages remain (i.e. every page not yet written has
// higher precedence pages), write them all in a last iteration (of course
// write_buffer will still check precedence before writing).
static void flushPages(thread_db* tdbb, USHORT flush_flag, BufferDesc** begin, FB_SIZE_T count)
{
FbStatusVector* const status = tdbb->tdbb_status_vector;
const bool all_flag = (flush_flag & FLUSH_ALL) != 0;
const bool release_flag = (flush_flag & FLUSH_RLSE) != 0;
const bool write_thru = release_flag;
qsort(begin, count, sizeof(BufferDesc*), cmpBdbs);
MarkIterator<BufferDesc*> iter(begin, count);
FB_SIZE_T written = 0;
bool writeAll = false;
while (!iter.isEmpty())
{
bool found = false;
for (; !iter.isEof(); ++iter)
{
BufferDesc* bdb = *iter;
fb_assert(bdb);
if (!bdb)
continue;
bdb->addRef(tdbb, release_flag ? SYNC_EXCLUSIVE : SYNC_SHARED);
BufferControl* bcb = bdb->bdb_bcb;
if (!writeAll)
purgePrecedence(bcb, bdb);
if (writeAll || QUE_EMPTY(bdb->bdb_higher))
{
if (release_flag)
{
if (bdb->bdb_use_count > 1)
BUGCHECK(210); // msg 210 page in use during flush
}
if (!all_flag || bdb->bdb_flags & (BDB_db_dirty | BDB_dirty))
{
if (!write_buffer(tdbb, bdb, bdb->bdb_page, write_thru, status, true))
CCH_unwind(tdbb, true);
}
// release lock before losing control over bdb, it prevents
// concurrent operations on released lock
if (release_flag)
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
bdb->release(tdbb, !release_flag && !(bdb->bdb_flags & BDB_dirty));
iter.mark();
found = true;
written++;
}
else
{
bdb->release(tdbb, false);
}
}
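// No page without higher precedence blocks was written in this pass: the
// remaining pages form a cycle or are too deeply chained, so write them all
// on the next pass (write_buffer itself still honours precedence).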
if (!found)
writeAll = true;
iter.rewind();
}
fb_assert(count == written);
}
#ifdef CACHE_READER
void BufferControl::cache_reader(BufferControl* bcb)
{
/**************************************
*
* c a c h e _ r e a d e r
*
**************************************
*
* Functional description
* Prefetch pages into cache for sequential scans.
* Use asynchronous I/O to keep two prefetch requests
* busy at a time.
*
**************************************/
Database* dbb = bcb->bcb_database;
Database::SyncGuard dsGuard(dbb);
FbLocalStatus status_vector;
// Establish a thread context.
ThreadContextHolder tdbb(status_vector);
// Dummy attachment needed for lock owner identification.
tdbb->setDatabase(dbb);
Jrd::Attachment* const attachment = Attachment::create(dbb);
tdbb->setAttachment(attachment);
attachment->att_filename = dbb->dbb_filename;
Jrd::ContextPoolHolder context(tdbb, bcb->bcb_bufferpool);
// This try block is specifically to protect the LCK_init call: if
// LCK_init fails we won't be able to accomplish anything anyway, so
// return, unlike the other try blocks further down the page.
try {
LCK_init(tdbb, LCK_OWNER_attachment);
TRA_init(attachment);
bcb = dbb->dbb_bcb;
bcb->bcb_flags |= BCB_cache_reader;
dbb->dbb_reader_init.post(); // Notify our creator that we have started
}
catch (const Firebird::Exception& ex)
{
ex.stuffException(&status_vector);
iscDbLogStatus(dbb->dbb_filename.c_str(), &status_vector);
return;
}
try {
// Set up dual prefetch control blocks to keep multiple prefetch
// requests active at a time. Also helps to prevent the cache reader
// from becoming dedicated or locked into a single request's prefetch demands.
prf prefetch1, prefetch2;
prefetch_init(&prefetch1, tdbb);
prefetch_init(&prefetch2, tdbb);
while (bcb->bcb_flags & BCB_cache_reader)
{
bcb->bcb_flags |= BCB_reader_active;
bool found = false;
SLONG starting_page = -1;
prf* next_prefetch = &prefetch1;
if (dbb->dbb_flags & DBB_suspend_bgio)
{
Database::Checkout dcoHolder(dbb, FB_FUNCTION);
dbb->dbb_reader_sem.tryEnter(10);
continue;
}
// Make a pass thru the global prefetch bitmap looking for something
// to read. When the entire bitmap has been scanned and found to be
// empty then wait for new requests.
do {
if (!(prefetch1.prf_flags & PRF_active) &&
SBM_next(bcb->bcb_prefetch, &starting_page, RSE_get_forward))
{
found = true;
prefetch_prologue(&prefetch1, &starting_page);
prefetch_io(&prefetch1, &status_vector);
}
if (!(prefetch2.prf_flags & PRF_active) &&
SBM_next(bcb->bcb_prefetch, &starting_page, RSE_get_forward))
{
found = true;
prefetch_prologue(&prefetch2, &starting_page);
prefetch_io(&prefetch2, &status_vector);
}
prf* post_prefetch = next_prefetch;
next_prefetch = (post_prefetch == &prefetch1) ? &prefetch2 : &prefetch1;
if (post_prefetch->prf_flags & PRF_active)
prefetch_epilogue(post_prefetch, &status_vector);
if (found)
{
// If the cache writer or garbage collector is idle, put
// them to work prefetching pages.
#ifdef CACHE_WRITER
if ((bcb->bcb_flags & BCB_cache_writer) && !(bcb->bcb_flags & BCB_writer_active))
bcb->bcb_writer_sem.release();
#endif
#ifdef GARBAGE_THREAD
if ((dbb->dbb_flags & DBB_garbage_collector) && !(dbb->dbb_flags & DBB_gc_active))
dbb->dbb_gc_sem.release();
#endif
}
} while (prefetch1.prf_flags & PRF_active || prefetch2.prf_flags & PRF_active);
// If there's more work to do, voluntarily ask to be rescheduled.
// Otherwise, wait for event notification.
BufferDesc* bdb;
if (found)
JRD_reschedule(tdbb, 0, true);
else if (bcb->bcb_flags & BCB_free_pending &&
(bdb = get_buffer(tdbb, FREE_PAGE, SYNC_NONE, 1)))
{
// In our spare time, help writer clean the cache.
write_buffer(tdbb, bdb, bdb->bdb_page, true, &status_vector, true);
}
else
{
bcb->bcb_flags &= ~BCB_reader_active;
Database::Checkout dcoHolder(dbb, FB_FUNCTION);
dbb->dbb_reader_sem.tryEnter(10);
}
bcb = dbb->dbb_bcb;
}
LCK_fini(tdbb, LCK_OWNER_attachment);
Jrd::Attachment::destroy(attachment); // no need saving warning error strings here
tdbb->setAttachment(NULL);
bcb->bcb_flags &= ~BCB_cache_reader;
dbb->dbb_reader_fini.post();
} // try
catch (const Firebird::Exception& ex)
{
ex.stuffException(&status_vector);
iscDbLogStatus(dbb->dbb_filename.c_str(), &status_vector);
}
}
#endif
void BufferControl::cache_writer(BufferControl* bcb)
{
/**************************************
*
* c a c h e _ w r i t e r
*
**************************************
*
* Functional description
* Write dirty pages to database to maintain an adequate supply of free pages.
*
**************************************/
FbLocalStatus status_vector;
Database* const dbb = bcb->bcb_database;
try
{
UserId user;
user.setUserName("Cache Writer");
Jrd::Attachment* const attachment = Jrd::Attachment::create(dbb);
RefPtr<SysStableAttachment> sAtt(FB_NEW SysStableAttachment(attachment));
attachment->setStable(sAtt);
attachment->att_filename = dbb->dbb_filename;
attachment->att_user = &user;
BackgroundContextHolder tdbb(dbb, attachment, &status_vector, FB_FUNCTION);
try
{
LCK_init(tdbb, LCK_OWNER_attachment);
PAG_header(tdbb, true);
PAG_attachment_id(tdbb);
TRA_init(attachment);
sAtt->initDone();
bcb->bcb_flags |= BCB_cache_writer;
bcb->bcb_flags &= ~BCB_writer_start;
// Notify our creator that we have started
bcb->bcb_writer_init.release();
while (bcb->bcb_flags & BCB_cache_writer)
{
bcb->bcb_flags |= BCB_writer_active;
#ifdef CACHE_READER
SLONG starting_page = -1;
#endif
if (dbb->dbb_flags & DBB_suspend_bgio)
{
EngineCheckout cout(tdbb, FB_FUNCTION);
bcb->bcb_writer_sem.tryEnter(10);
continue;
}
#ifdef SUPERSERVER_V2
// Flush buffers for lazy commit
SLONG commit_mask;
if (!(dbb->dbb_flags & DBB_force_write) && (commit_mask = dbb->dbb_flush_cycle))
{
dbb->dbb_flush_cycle = 0;
btc_flush(tdbb, commit_mask, false, status_vector);
}
#endif
if (bcb->bcb_flags & BCB_free_pending)
{
BufferDesc* const bdb = get_buffer(tdbb, FREE_PAGE, SYNC_NONE, 1);
if (bdb)
write_buffer(tdbb, bdb, bdb->bdb_page, true, &status_vector, true);
}
// If there's more work to do, voluntarily ask to be rescheduled.
// Otherwise, wait for event notification.
if ((bcb->bcb_flags & BCB_free_pending) || dbb->dbb_flush_cycle)
{
JRD_reschedule(tdbb, 0, true);
}
#ifdef CACHE_READER
else if (SBM_next(bcb->bcb_prefetch, &starting_page, RSE_get_forward))
{
// Prefetch some pages in our spare time and in the process
// garbage collect the prefetch bitmap.
prf prefetch;
prefetch_init(&prefetch, tdbb);
prefetch_prologue(&prefetch, &starting_page);
prefetch_io(&prefetch, status_vector);
prefetch_epilogue(&prefetch, status_vector);
}
#endif
else
{
bcb->bcb_flags &= ~BCB_writer_active;
EngineCheckout cout(tdbb, FB_FUNCTION);
bcb->bcb_writer_sem.tryEnter(10);
}
}
}
catch (const Firebird::Exception& ex)
{
ex.stuffException(&status_vector);
iscDbLogStatus(dbb->dbb_filename.c_str(), &status_vector);
// continue execution to clean up
}
Monitoring::cleanupAttachment(tdbb);
attachment->releaseLocks(tdbb);
LCK_fini(tdbb, LCK_OWNER_attachment);
attachment->releaseRelations(tdbb);
} // try
catch (const Firebird::Exception& ex)
{
bcb->exceptionHandler(ex, cache_writer);
}
bcb->bcb_flags &= ~BCB_cache_writer;
try
{
if (bcb->bcb_flags & BCB_writer_start)
{
bcb->bcb_flags &= ~BCB_writer_start;
bcb->bcb_writer_init.release();
}
}
catch (const Firebird::Exception& ex)
{
bcb->exceptionHandler(ex, cache_writer);
}
}
void BufferControl::exceptionHandler(const Firebird::Exception& ex, BcbThreadSync::ThreadRoutine*)
{
FbLocalStatus status_vector;
ex.stuffException(&status_vector);
iscDbLogStatus(bcb_database->dbb_filename.c_str(), &status_vector);
}
static void check_precedence(thread_db* tdbb, WIN* window, PageNumber page)
{
/**************************************
*
* c h e c k _ p r e c e d e n c e
*
**************************************
*
* Functional description
* Given a window accessed for write and a page number,
* establish a precedence relationship such that the
* specified page will always be written before the page
* associated with the window.
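*
* A typical use is careful-write ordering: e.g. a newly created page
* should reach disk before any page that points at it, so a crash can
* never leave a pointer to uninitialized disk space.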
*
* If the "page number" is negative, it is really a transaction
* id. In this case, the precedence relationship is to the
* database header page from which the transaction id was
* obtained. If the header page has been written since the
* transaction id was assigned, no precedence relationship
* is required.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
// If this is really a transaction id, sort things out
switch(page.getPageSpaceID())
{
case DB_PAGE_SPACE:
break;
case TRANS_PAGE_SPACE:
if (page.getPageNum() <= tdbb->getDatabase()->dbb_last_header_write)
return;
page = PageNumber(DB_PAGE_SPACE, 0);
break;
default:
fb_assert(false);
return;
}
// In the past, a negative value passed here meant not a page but a transaction number.
// When we finally move to 32 (not 31) bit page numbers, this should be removed,
// but currently I add:
fb_assert(!(page.getPageNum() & 0x80000000));
// to help detect cases when something possibly negative is passed.
// AP - 2011.
// Start by finding the buffer containing the high priority page
Sync bcbSync(&bcb->bcb_syncObject, "check_precedence");
bcbSync.lock(SYNC_SHARED);
BufferDesc* high = find_buffer(bcb, page, false);
bcbSync.unlock();
if (!high)
return;
// Found the higher precedence buffer. If it's not dirty, don't sweat it.
// If it's the same page, ditto.
if (!(high->bdb_flags & BDB_dirty) || (high->bdb_page == window->win_page))
return;
BufferDesc* low = window->win_bdb;
if ((low->bdb_flags & BDB_marked) && !(low->bdb_flags & BDB_faked))
BUGCHECK(212); // msg 212 CCH_precedence: block marked
// If already related, there's nothing more to do. If the precedence
// search was too complex to complete, just write the high page and
// forget about establishing the relationship.
Sync precSync(&bcb->bcb_syncPrecedence, "check_precedence");
precSync.lock(SYNC_EXCLUSIVE);
if (QUE_NOT_EMPTY(high->bdb_lower))
{
const ULONG mark = get_prec_walk_mark(bcb);
const SSHORT relationship = related(low, high, PRE_SEARCH_LIMIT, mark);
if (relationship == PRE_EXISTS)
return;
if (relationship == PRE_UNKNOWN)
{
precSync.unlock();
const PageNumber high_page = high->bdb_page;
if (!write_buffer(tdbb, high, high_page, false, tdbb->tdbb_status_vector, true))
CCH_unwind(tdbb, true);
return;
}
}
// Check to see if we're going to create a cycle or the precedence search
// was too complex to complete. If so, force a write of the "after"
// (currently fetched) page. Assuming everyone obeys the rules and calls
// precedence before marking the buffer, everything should be ok
while (QUE_NOT_EMPTY(low->bdb_lower))
{
const ULONG mark = get_prec_walk_mark(bcb);
const SSHORT relationship = related(high, low, PRE_SEARCH_LIMIT, mark);
if (relationship == PRE_EXISTS || relationship == PRE_UNKNOWN)
{
precSync.unlock();
const PageNumber low_page = low->bdb_page;
if (!write_buffer(tdbb, low, low_page, false, tdbb->tdbb_status_vector, true))
CCH_unwind(tdbb, true);
precSync.lock(SYNC_EXCLUSIVE);
}
else
break;
}
// We're going to establish a new precedence relationship. Get a block,
// fill in the appropriate fields, and insert it into the various ques
Precedence* precedence = bcb->bcb_free;
if (precedence)
bcb->bcb_free = (Precedence*) precedence->pre_hi;
else
precedence = FB_NEW_POOL(*bcb->bcb_bufferpool) Precedence;
precedence->pre_low = low;
precedence->pre_hi = high;
precedence->pre_flags = 0;
QUE_INSERT(low->bdb_higher, precedence->pre_higher);
QUE_INSERT(high->bdb_lower, precedence->pre_lower);
// explicitly include high page in system transaction flush process
if (low->bdb_flags & BDB_system_dirty && high->bdb_flags & BDB_dirty)
high->bdb_flags |= BDB_system_dirty;
}
static void clear_precedence(thread_db* tdbb, BufferDesc* bdb)
{
/**************************************
*
* c l e a r _ p r e c e d e n c e
*
**************************************
*
* Functional description
* Clear precedence relationships to lower precedence block.
*
**************************************/
SET_TDBB(tdbb);
if (QUE_EMPTY(bdb->bdb_lower))
return;
BufferControl* const bcb = bdb->bdb_bcb;
Sync precSync(&bcb->bcb_syncPrecedence, "clear_precedence");
if (!bcb->bcb_syncPrecedence.ourExclusiveLock())
precSync.lock(SYNC_EXCLUSIVE);
// Loop thru lower precedence buffers. If any can be downgraded,
// by all means down grade them.
while (QUE_NOT_EMPTY(bdb->bdb_lower))
{
QUE que_inst = bdb->bdb_lower.que_forward;
Precedence* precedence = BLOCK(que_inst, Precedence, pre_lower);
BufferDesc* low_bdb = precedence->pre_low;
QUE_DELETE(precedence->pre_higher);
QUE_DELETE(precedence->pre_lower);
precedence->pre_hi = (BufferDesc*) bcb->bcb_free;
bcb->bcb_free = precedence;
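// If this precedence edge had not already been cleared and the lower
// page's lock is blocking another process, re-post the lock so its
// blocking AST is delivered again now that the barrier is gone.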
if (!(precedence->pre_flags & PRE_cleared))
{
if (low_bdb->bdb_ast_flags & BDB_blocking)
PAGE_LOCK_RE_POST(tdbb, bcb, low_bdb->bdb_lock);
}
}
}
static BufferDesc* dealloc_bdb(BufferDesc* bdb)
{
/**************************************
*
* d e a l l o c _ b d b
*
**************************************
*
* Functional description
* Deallocate buffer descriptor block.
*
**************************************/
if (bdb)
{
delete bdb->bdb_lock;
QUE_DELETE(bdb->bdb_que);
delete bdb;
}
return NULL;
}
static void down_grade(thread_db* tdbb, BufferDesc* bdb, int high)
{
/**************************************
*
* d o w n _ g r a d e
*
**************************************
*
* Functional description
* A lock on a page is blocking another process. If possible, downgrade
* the lock on the buffer. This may be called from either AST or
* regular level. Historically the function returned true if the downgrade
* succeeded and false if it was deferred; it is now void, and the old
* results are preserved in the commented-out returns below.
*
**************************************/
SET_TDBB(tdbb);
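// Atomically set BDB_blocking and remember whether it was already set,
// so the flag can be restored if we merely flush the page on behalf of
// a lower-precedence buffer (see the justWrite path below).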
const bool oldBlocking = (bdb->bdb_ast_flags.exchangeBitOr(BDB_blocking) & BDB_blocking);
Lock* lock = bdb->bdb_lock;
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = bdb->bdb_bcb;
if (dbb->dbb_flags & DBB_bugcheck)
{
PAGE_LOCK_RELEASE(tdbb, bcb, lock);
bdb->bdb_ast_flags &= ~BDB_blocking;
clear_dirty_flag_and_nbak_state(tdbb, bdb);
return; // true;
}
// If the BufferDesc is in use and is being written or has already been
// downgraded to read, mark it as blocking and exit.
bool justWrite = false;
if (bdb->isLocked() || !bdb->addRefConditional(tdbb, SYNC_EXCLUSIVE))
{
if (!high)
return; // false;
// hvlad: buffer is in use and we can't downgrade its lock. But if this
// buffer blocks some lower precedence buffer, it is enough to just write
// our (high) buffer to clear precedence and thus allow blocked (low)
// buffer to be downgraded. IO lock guarantees that there is no buffer
// modification in progress currently and it is safe to write it right now.
// No need to mark our buffer as blocking nor to change state of our lock.
bdb->lockIO(tdbb);
if (!(bdb->bdb_flags & BDB_dirty))
{
bdb->unLockIO(tdbb);
return; // true
}
if (!oldBlocking)
bdb->bdb_ast_flags &= ~BDB_blocking;
justWrite = true;
}
// If the page isn't dirty, the lock can be quietly downgraded.
if (!(bdb->bdb_flags & BDB_dirty))
{
bdb->bdb_ast_flags &= ~BDB_blocking;
LCK_downgrade(tdbb, lock);
bdb->release(tdbb, false);
return; // true;
}
bool in_use = false, invalid = false;
if (bdb->bdb_flags & BDB_not_valid)
invalid = true;
// If there are higher precedence guys, see if they can be written.
while (QUE_NOT_EMPTY(bdb->bdb_higher))
{ // syncPrec scope
Sync syncPrec(&bcb->bcb_syncPrecedence, "down_grade");
syncPrec.lock(SYNC_EXCLUSIVE);
bool found = false;
for (QUE que_inst = bdb->bdb_higher.que_forward; que_inst != &bdb->bdb_higher;
que_inst = que_inst->que_forward)
{
Precedence* precedence = BLOCK(que_inst, Precedence, pre_higher);
if (precedence->pre_flags & PRE_cleared)
continue;
if (invalid)
{
precedence->pre_flags |= PRE_cleared;
continue;
}
BufferDesc* blocking_bdb = precedence->pre_hi;
if (blocking_bdb->bdb_flags & BDB_dirty)
{
found = true;
syncPrec.unlock();
down_grade(tdbb, blocking_bdb, high + 1);
if ((blocking_bdb->bdb_flags & BDB_dirty) && !(precedence->pre_flags & PRE_cleared))
in_use = true;
if (blocking_bdb->bdb_flags & BDB_not_valid)
{
invalid = true;
in_use = false;
que_inst = bdb->bdb_higher.que_forward;
}
break;
}
}
// If any higher precedence buffer can't be written, mark this buffer as blocking and exit.
if (in_use)
{
if (justWrite)
bdb->unLockIO(tdbb);
else
bdb->release(tdbb, false);
return; // false;
}
if (!found)
break;
} // syncPrec scope
// Everything is clear to write this buffer. Do so and reduce the lock
if (!justWrite)
bdb->lockIO(tdbb);
const bool written = !(bdb->bdb_flags & BDB_dirty) ||
write_page(tdbb, bdb, tdbb->tdbb_status_vector, true);
if (!justWrite)
bdb->unLockIO(tdbb);
if (invalid || !written)
{
bdb->bdb_flags |= BDB_not_valid;
clear_dirty_flag_and_nbak_state(tdbb, bdb);
bdb->bdb_ast_flags &= ~BDB_blocking;
TRA_invalidate(tdbb, bdb->bdb_transactions);
bdb->bdb_transactions = 0;
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
}
else if (!justWrite)
{
bdb->bdb_ast_flags &= ~BDB_blocking;
LCK_downgrade(tdbb, lock);
}
// Clear precedence relationships to lower precedence buffers. Since it
// isn't safe to tweak the que pointers from AST level, just mark the precedence
// links as cleared. Somebody else will clean up the precedence blocks.
while (QUE_NOT_EMPTY(bdb->bdb_lower))
{ // syncPrec scope
Sync syncPrec(&bcb->bcb_syncPrecedence, "down_grade");
syncPrec.lock(SYNC_EXCLUSIVE);
bool found = false;
for (QUE que_inst = bdb->bdb_lower.que_forward; que_inst != &bdb->bdb_lower;
que_inst = que_inst->que_forward)
{
Precedence* precedence = BLOCK(que_inst, Precedence, pre_lower);
if (precedence->pre_flags & PRE_cleared)
continue;
BufferDesc* blocking_bdb = precedence->pre_low;
if (bdb->bdb_flags & BDB_not_valid) {
blocking_bdb->bdb_flags |= BDB_not_valid;
}
precedence->pre_flags |= PRE_cleared;
if ((blocking_bdb->bdb_flags & BDB_not_valid) || (blocking_bdb->bdb_ast_flags & BDB_blocking))
{
found = true;
syncPrec.unlock();
down_grade(tdbb, blocking_bdb);
break;
}
}
if (!found)
break;
} // syncPrec scope
bdb->bdb_flags &= ~BDB_not_valid;
if (justWrite)
bdb->unLockIO(tdbb);
else
bdb->release(tdbb, false);
return; // true;
}
static bool expand_buffers(thread_db* tdbb, ULONG number)
{
/**************************************
*
* e x p a n d _ b u f f e r s
*
**************************************
*
* Functional description
* Expand the cache to at least a given number of buffers. If
* it's already that big, don't do anything.
*
* Nickolay Samofatov, 08-Mar-2004.
* This function does not handle exceptions correctly,
* it looks like good handling requires rewrite.
*
**************************************/
SET_TDBB(tdbb);
Database* const dbb = tdbb->getDatabase();
BufferControl* const bcb = dbb->dbb_bcb;
if (number <= bcb->bcb_count || number > MAX_PAGE_BUFFERS)
return false;
Sync syncBcb(&bcb->bcb_syncObject, "expand_buffers");
syncBcb.lock(SYNC_EXCLUSIVE);
// for Win16 platform, we want to ensure that no cache buffer ever ends on a segment boundary
// CVC: Is this code obsolete or only the comment?
ULONG num_per_seg = number - bcb->bcb_count;
ULONG left_to_do = num_per_seg;
// Allocate and initialize buffers control block
Jrd::ContextPoolHolder context(tdbb, bcb->bcb_bufferpool);
const bcb_repeat* const old_end = bcb->bcb_rpt + bcb->bcb_count;
bcb_repeat* const new_rpt = FB_NEW_POOL(*bcb->bcb_bufferpool) bcb_repeat[number];
bcb_repeat* const old_rpt = bcb->bcb_rpt;
bcb->bcb_rpt = new_rpt;
bcb->bcb_count = number;
bcb->bcb_free_minimum = (SSHORT) MIN(number / 4, 128); /* 25% clean page reserve */
const bcb_repeat* const new_end = bcb->bcb_rpt + number;
// Initialize tail of new buffer control block
bcb_repeat* new_tail;
for (new_tail = bcb->bcb_rpt; new_tail < new_end; new_tail++)
QUE_INIT(new_tail->bcb_page_mod);
// Move any active buffers from old block to new
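// (a page's hash bucket depends on bcb_count, which has just changed,
// so each queue entry must be re-hashed into its new bucket)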
new_tail = bcb->bcb_rpt;
for (bcb_repeat* old_tail = old_rpt; old_tail < old_end; old_tail++, new_tail++)
{
new_tail->bcb_bdb = old_tail->bcb_bdb;
while (QUE_NOT_EMPTY(old_tail->bcb_page_mod))
{
QUE que_inst = old_tail->bcb_page_mod.que_forward;
BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_que);
QUE_DELETE(*que_inst);
QUE mod_que = &bcb->bcb_rpt[bdb->bdb_page.getPageNum() % bcb->bcb_count].bcb_page_mod;
QUE_INSERT(*mod_que, *que_inst);
}
}
// Allocate new buffer descriptor blocks
ULONG num_in_seg = 0;
UCHAR* memory = NULL;
for (; new_tail < new_end; new_tail++)
{
// if current segment is exhausted, allocate another
if (!num_in_seg)
{
const size_t alloc_size = dbb->dbb_page_size * (num_per_seg + 1);
memory = (UCHAR*) bcb->bcb_bufferpool->allocate(alloc_size ALLOC_ARGS);
bcb->bcb_memory.push(memory);
memory = FB_ALIGN(memory, dbb->dbb_page_size);
num_in_seg = num_per_seg;
left_to_do -= num_per_seg;
if (num_per_seg > left_to_do)
num_per_seg = left_to_do;
}
new_tail->bcb_bdb = alloc_bdb(tdbb, bcb, &memory);
num_in_seg--;
}
// Set up new buffer control, release old buffer control, and clean up
delete[] old_rpt;
return true;
}
static BufferDesc* find_buffer(BufferControl* bcb, const PageNumber page, bool findPending)
{
QUE mod_que = &bcb->bcb_rpt[page.getPageNum() % bcb->bcb_count].bcb_page_mod;
QUE que_inst = mod_que->que_forward;
for (; que_inst != mod_que; que_inst = que_inst->que_forward)
{
BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_que);
if (bdb->bdb_page == page)
return bdb;
}
if (findPending)
{
que_inst = bcb->bcb_pending.que_forward;
for (; que_inst != &bcb->bcb_pending; que_inst = que_inst->que_forward)
{
BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_que);
if (bdb->bdb_page == page || bdb->bdb_pending_page == page)
return bdb;
}
}
return NULL;
}
static LatchState latch_buffer(thread_db* tdbb, Sync &bcbSync, BufferDesc *bdb,
const PageNumber page, SyncType syncType, int wait)
{
//++bdb->bdb_use_count;
if (!(bdb->bdb_flags & BDB_free_pending)
#ifdef SUPERSERVER_V2
&& (page != HEADER_PAGE_NUMBER)
#endif
)
{
recentlyUsed(bdb);
}
// If the buffer is currently being replaced by another page but is still being
// written to disk, we should wait until this write finishes; otherwise we could
// allocate another buffer and read the old page image (or even zeros)
// from disk into the new buffer
const bool waitPending = ((bdb->bdb_flags & BDB_free_pending) && bdb->bdb_page == page);
bcbSync.unlock();
if (waitPending)
{
//--bdb->bdb_use_count;
if (wait == 0)
return lsTimeout; // go out
Thread::yield();
}
else
{
const bool latchOk = bdb->addRef(tdbb, syncType, wait);
//--bdb->bdb_use_count;
if (!latchOk)
return lsTimeout; // go out
if (bdb->bdb_page == page)
{
//bdb->bdb_flags &= ~(BDB_faked | BDB_prefetch);
return lsOk;
}
bdb->release(tdbb, true);
}
return lsPageChanged; // try again
}
static BufferDesc* get_buffer(thread_db* tdbb, const PageNumber page, SyncType syncType, int wait)
{
/**************************************
*
* g e t _ b u f f e r
*
**************************************
*
* Functional description
* Get a buffer. If possible, get a buffer already assigned
* to the page. Otherwise get one from the free list or pick
* the least recently used buffer to be reused.
* Note the following special page numbers:
* -1 indicates that a buffer is required for journaling => obsolete
* -2 indicates a special scratch buffer for shadowing
*
* input
* page: page to get
* syncType: type of lock to acquire on the page.
* wait: 1 => Wait as long as necessary to get the lock.
* This can cause deadlocks of course.
* 0 => If the lock can't be acquired immediately,
* give up and return 0;
* <negative number> => Latch timeout interval in seconds.
*
* return
* BufferDesc pointer if successful.
* NULL pointer if a timeout occurred (only possible if wait <> 1), or,
* for FREE_PAGE, if the cache manager has no more pages to write.
*
**************************************/
SET_TDBB(tdbb);
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
Sync bcbSync(&bcb->bcb_syncObject, "get_buffer");
if (page != FREE_PAGE)
{
bcbSync.lock(SYNC_SHARED);
BufferDesc* bdb = find_buffer(bcb, page, true);
while (bdb)
{
const LatchState ret = latch_buffer(tdbb, bcbSync, bdb, page, syncType, wait);
if (ret == lsOk)
{
tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES);
return bdb;
}
if (ret == lsTimeout)
return NULL;
bcbSync.lock(SYNC_SHARED);
bdb = find_buffer(bcb, page, true);
}
bcbSync.unlock();
}
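// Slow path: take the cache latch exclusively. Under it we either find the
// page after all (another thread may have installed it meanwhile), claim an
// empty buffer, or evict the least recently used replaceable buffer,
// expanding the cache if no victim can be found.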
bcbSync.lock(SYNC_EXCLUSIVE);
QUE que_inst;
int walk = bcb->bcb_free_minimum;
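// 'walk' bounds how many dirty LRU candidates this thread passes over
// (waking the cache writer to clean them) before it gives up waiting
// and writes one itself.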
while (true)
{
if (page != FREE_PAGE)
{
// Check to see if buffer has already been assigned to page
BufferDesc* bdb = find_buffer(bcb, page, true);
while (bdb)
{
const LatchState ret = latch_buffer(tdbb, bcbSync, bdb, page, syncType, wait);
if (ret == lsOk)
{
tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES);
return bdb;
}
if (ret == lsTimeout)
return NULL;
bcbSync.lock(SYNC_EXCLUSIVE);
bdb = find_buffer(bcb, page, true);
}
}
else // page == FREE_PAGE
{
// This code is only used by the background I/O threads:
// cache writer, cache reader and garbage collector.
//Database::Checkout dcoHolder(dbb);
Sync lruSync(&bcb->bcb_syncLRU, "get_buffer");
lruSync.lock(SYNC_EXCLUSIVE);
for (que_inst = bcb->bcb_in_use.que_backward;
que_inst != &bcb->bcb_in_use; que_inst = que_inst->que_backward)
{
BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_in_use);
if (bdb->bdb_use_count || (bdb->bdb_flags & BDB_free_pending))
continue;
if (bdb->bdb_flags & BDB_db_dirty)
{
//tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES); shouldn't it be here?
return bdb;
}
if (!--walk)
{
bcb->bcb_flags &= ~BCB_free_pending;
break;
}
}
// hvlad: removed in Vulcan
bcb->bcb_flags &= ~BCB_free_pending;
return NULL;
}
// If there is an empty buffer sitting around, allocate it
if (QUE_NOT_EMPTY(bcb->bcb_empty))
{
que_inst = bcb->bcb_empty.que_forward;
QUE_DELETE(*que_inst);
BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_que);
bcb->bcb_inuse++;
bdb->addRef(tdbb, SYNC_EXCLUSIVE);
if (page != FREE_PAGE)
{
QUE mod_que = &bcb->bcb_rpt[page.getPageNum() % bcb->bcb_count].bcb_page_mod;
QUE_INSERT(*mod_que, *que_inst);
#ifdef SUPERSERVER_V2
// Reserve a buffer for header page with deferred header
// page write mechanism. Otherwise, a deadlock will occur
// if all dirty pages in the cache must force header page
// to disk before they can be written but there is no free
// buffer to read the header page into.
if (page != HEADER_PAGE_NUMBER)
#endif
{
Sync lruSync(&bcb->bcb_syncLRU, "get_buffer");
lruSync.lock(SYNC_EXCLUSIVE);
QUE_INSERT(bcb->bcb_in_use, bdb->bdb_in_use);
}
}
// This correction for bdb_use_count below is needed to
// avoid a deadlock situation in latching code. It's not
// clear though how the bdb_use_count can get < 0 for a bdb
// in bcb_empty queue
if (bdb->bdb_use_count < 0)
BUGCHECK(301); // msg 301 Non-zero use_count of a buffer in the empty que_inst
bdb->bdb_page = page;
bdb->bdb_flags = BDB_read_pending; // we have buffer exclusively, this is safe
bdb->bdb_scan_count = 0;
if (page != FREE_PAGE)
{
CCH_TRACE(("bdb->bdb_lock->lck_logical = LCK_none; page=%i", bdb->bdb_page.getPageNum()));
bdb->bdb_lock->lck_logical = LCK_none;
}
else
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES);
return bdb;
}
Sync lruSync(&bcb->bcb_syncLRU, "get_buffer");
lruSync.lock(SYNC_EXCLUSIVE);
if (bcb->bcb_lru_chain)
requeueRecentlyUsed(bcb);
for (que_inst = bcb->bcb_in_use.que_backward;
que_inst != &bcb->bcb_in_use;
que_inst = que_inst->que_backward)
{
// get the oldest buffer as the least recently used -- note
// that since there are no empty buffers this queue cannot be empty
if (bcb->bcb_in_use.que_forward == &bcb->bcb_in_use)
BUGCHECK(213); // msg 213 insufficient cache size
BufferDesc* oldest = BLOCK(que_inst, BufferDesc, bdb_in_use);
if (oldest->bdb_flags & BDB_lru_chained)
continue;
if (oldest->bdb_use_count || !oldest->addRefConditional(tdbb, SYNC_EXCLUSIVE))
continue;
if ((oldest->bdb_flags & BDB_free_pending) || !writeable(oldest))
{
oldest->release(tdbb, true);
continue;
}
#ifdef SUPERSERVER_V2
// If page has been prefetched but not yet fetched, let
// it cycle once more thru LRU queue before re-using it.
if (oldest->bdb_flags & BDB_prefetch)
{
oldest->bdb_flags &= ~BDB_prefetch;
que_inst = que_inst->que_forward;
QUE_MOST_RECENTLY_USED(oldest->bdb_in_use);
//LATCH_MUTEX_RELEASE;
continue;
}
#endif
if ((bcb->bcb_flags & BCB_cache_writer) &&
(oldest->bdb_flags & (BDB_dirty | BDB_db_dirty)) )
{
bcb->bcb_flags |= BCB_free_pending;
if (!(bcb->bcb_flags & BCB_writer_active))
bcb->bcb_writer_sem.release();
if (walk)
{
oldest->release(tdbb, true);
if (!--walk)
break;
continue;
}
}
BufferDesc* bdb = oldest;
// hvlad: we already have bcb_lruSync here
//recentlyUsed(bdb);
fb_assert(!(bdb->bdb_flags & BDB_lru_chained));
QUE_DELETE(bdb->bdb_in_use);
QUE_INSERT(bcb->bcb_in_use, bdb->bdb_in_use);
lruSync.unlock();
bdb->bdb_flags |= BDB_free_pending;
bdb->bdb_pending_page = page;
QUE_DELETE(bdb->bdb_que);
QUE_INSERT(bcb->bcb_pending, bdb->bdb_que);
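// While BDB_free_pending is set and the bdb sits in bcb_pending, find_buffer
// still locates it by its old page number or by bdb_pending_page, so
// concurrent requests wait or retry instead of reading a stale image.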
const bool needCleanup = (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) ||
QUE_NOT_EMPTY(bdb->bdb_higher) || QUE_NOT_EMPTY(bdb->bdb_lower);
if (needCleanup)
{
bcbSync.unlock();
// If the buffer selected is dirty, arrange to have it written.
if (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty))
{
const bool write_thru = (bcb->bcb_flags & BCB_exclusive);
if (!write_buffer(tdbb, bdb, bdb->bdb_page, write_thru, tdbb->tdbb_status_vector, true))
{
bcbSync.lock(SYNC_EXCLUSIVE);
bdb->bdb_flags &= ~BDB_free_pending;
QUE_DELETE(bdb->bdb_in_use);
QUE_APPEND(bcb->bcb_in_use, bdb->bdb_in_use);
bcbSync.unlock();
bdb->release(tdbb, true);
CCH_unwind(tdbb, true);
}
}
// If the buffer is still in the dirty tree, remove it.
// In any case, release any lock it may have.
removeDirty(bcb, bdb);
// Cleanup any residual precedence blocks. Unless something is
// screwed up, the only precedence blocks that can still be hanging
// around are ones cleared at AST level.
if (QUE_NOT_EMPTY(bdb->bdb_higher) || QUE_NOT_EMPTY(bdb->bdb_lower))
{
Sync precSync(&bcb->bcb_syncPrecedence, "get_buffer");
precSync.lock(SYNC_EXCLUSIVE);
while (QUE_NOT_EMPTY(bdb->bdb_higher))
{
QUE que2 = bdb->bdb_higher.que_forward;
Precedence* precedence = BLOCK(que2, Precedence, pre_higher);
QUE_DELETE(precedence->pre_higher);
QUE_DELETE(precedence->pre_lower);
precedence->pre_hi = (BufferDesc*) bcb->bcb_free;
bcb->bcb_free = precedence;
}
clear_precedence(tdbb, bdb);
}
bcbSync.lock(SYNC_EXCLUSIVE);
}
QUE_DELETE(bdb->bdb_que); // bcb_pending
QUE mod_que = &bcb->bcb_rpt[page.getPageNum() % bcb->bcb_count].bcb_page_mod;
QUE_INSERT((*mod_que), bdb->bdb_que);
bdb->bdb_flags &= ~BDB_free_pending;
// This correction for bdb_use_count below is needed to
// avoid a deadlock situation in latching code. It's not
// clear though how the bdb_use_count can get < 0 for a bdb
// in bcb_empty queue
if (bdb->bdb_use_count < 0)
BUGCHECK(301); // msg 301 Non-zero use_count of a buffer in the empty que
bdb->bdb_page = page;
bdb->bdb_flags &= BDB_lru_chained; // yes, clear all except BDB_lru_chained
bdb->bdb_flags |= BDB_read_pending;
bdb->bdb_scan_count = 0;
bcbSync.unlock();
if (page != FREE_PAGE)
bdb->bdb_lock->lck_logical = LCK_none;
else
PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock);
tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES);
return bdb;
}
if (que_inst == &bcb->bcb_in_use)
expand_buffers(tdbb, bcb->bcb_count + 75);
}
}
static ULONG get_prec_walk_mark(BufferControl* bcb)
{
/**************************************
*
* g e t _ p r e c _ w a l k _ m a r k
*
**************************************
*
* Functional description
* Get next mark for walking precedence graph.
*
**************************************/
fb_assert(bcb->bcb_syncPrecedence.ourExclusiveLock());
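// When the counter wraps to zero, clear every buffer's mark so stale
// marks from the previous epoch cannot be mistaken for current ones.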
if (++bcb->bcb_prec_walk_mark == 0)
{
for (ULONG i = 0; i < bcb->bcb_count; i++)
bcb->bcb_rpt[i].bcb_bdb->bdb_prec_walk_mark = 0;
bcb->bcb_prec_walk_mark = 1;
}
return bcb->bcb_prec_walk_mark;
}
static int get_related(BufferDesc* bdb, PagesArray &lowPages, int limit, const ULONG mark)
{
/**************************************
*
* g e t _ r e l a t e d
*
**************************************
*
* Functional description
* Recursively walk low part of precedence graph of given buffer and put
* low pages numbers into array.
*
**************************************/
BufferControl* bcb = bdb->bdb_bcb;
fb_assert(bcb->bcb_syncPrecedence.ourExclusiveLock());
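// Depth-first walk of the "lower" precedence edges collecting page numbers;
// bdb_prec_walk_mark avoids re-visiting shared sub-graphs, and 'limit'
// bounds the total work (a return of 0 means the graph was too complex).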
const struct que* base = &bdb->bdb_lower;
for (const struct que* que_inst = base->que_forward; que_inst != base;
que_inst = que_inst->que_forward)
{
const Precedence* precedence = BLOCK(que_inst, Precedence, pre_lower);
if (precedence->pre_flags & PRE_cleared)
continue;
BufferDesc* low = precedence->pre_low;
if (low->bdb_prec_walk_mark == mark)
continue;
if (!--limit)
return 0;
const SLONG lowPage = low->bdb_page.getPageNum();
FB_SIZE_T pos;
if (!lowPages.find(lowPage, pos))
lowPages.insert(pos, lowPage);
if (QUE_NOT_EMPTY(low->bdb_lower))
{
limit = get_related(low, lowPages, limit, mark);
if (!limit)
return 0;
}
else
low->bdb_prec_walk_mark = mark;
}
bdb->bdb_prec_walk_mark = mark;
return limit;
}
static LockState lock_buffer(thread_db* tdbb, BufferDesc* bdb, const SSHORT wait,
const SCHAR page_type)
{
/**************************************
*
* l o c k _ b u f f e r
*
**************************************
*
* Functional description
* Get a lock on page for a buffer. If the lock ever slipped
* below READ, indicate that the page must be read.
*
* input:
* wait: LCK_WAIT = 1 => Wait as long as necessary to get the lock.
* LCK_NO_WAIT = 0 => If the lock can't be acquired immediately,
* give up and return -1.
* <negative number> => Lock timeout interval in seconds.
*
* return: 0 => buffer locked, page is already in memory.
* 1 => buffer locked, page needs to be read from disk.
* -1 => timeout on lock occurred, see input parameter 'wait'.
*
**************************************/
SET_TDBB(tdbb);
BufferControl* const bcb = bdb->bdb_bcb;
fb_assert(!(bcb->bcb_flags & BCB_exclusive));
const USHORT lock_type = (bdb->bdb_flags & (BDB_dirty | BDB_writer)) ? LCK_write : LCK_read;
CCH_TRACE(("FE LOCK %d:%06d, %s", bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum(),
(lock_type >= LCK_write) ? "EX" : "SH" ));
Lock* const lock = bdb->bdb_lock;
if (lock->lck_logical >= lock_type)
return lsLockedHavePage;
TEXT errmsg[MAX_ERRMSG_LEN + 1];
ThreadStatusGuard tempStatus(tdbb);
if (lock->lck_logical == LCK_none)
{
// Prevent header and TIP pages from generating blocking AST
// overhead. The promise is that the lock will unconditionally
// be released when the buffer use count indicates it is safe to do so.
if (page_type == pag_header || page_type == pag_transactions)
{
fb_assert(lock->lck_ast == blocking_ast_bdb);
fb_assert(lock->lck_object == bdb);
lock->lck_ast = 0;
lock->lck_object = NULL;
}
else {
fb_assert(lock->lck_ast != NULL);
}
bdb->bdb_page.getLockStr(lock->getKeyPtr());
if (LCK_lock_opt(tdbb, lock, lock_type, wait))
{
if (!lock->lck_ast)
{
// Restore blocking AST to lock block if it was swapped out.
// Flag the BufferDesc so that the lock is released when the buffer is released.
fb_assert(page_type == pag_header || page_type == pag_transactions);
lock->lck_ast = blocking_ast_bdb;
lock->lck_object = bdb;
bdb->bdb_flags |= BDB_no_blocking_ast;
}
return lsLocked;
}
if (!lock->lck_ast)
{
fb_assert(page_type == pag_header || page_type == pag_transactions);
lock->lck_ast = blocking_ast_bdb;
lock->lck_object = bdb;
}
// Case: a timeout was specified, or the caller didn't want to wait, return the error.
if ((wait == LCK_NO_WAIT) || ((wait < 0) && (tempStatus->getErrors()[1] == isc_lock_timeout)))
{
bdb->release(tdbb, false);
return lsLockTimeout;
}
// Case: lock manager detected a deadlock, probably caused by locking the
// BufferDesc's in an unfortunate order. Nothing we can do about it, return the
// error, and log it to firebird.log.
FbStatusVector* const status = tempStatus.restore();
fb_msg_format(0, JRD_BUGCHK, 216, sizeof(errmsg), errmsg,
MsgFormat::SafeArg() << bdb->bdb_page.getPageNum() << (int) page_type);
ERR_append_status(status, Arg::Gds(isc_random) << Arg::Str(errmsg));
ERR_log(JRD_BUGCHK, 216, errmsg); // msg 216 page %ld, page type %ld lock denied
// CCH_unwind releases all the BufferDesc's and calls ERR_punt()
// ERR_punt will longjump.
CCH_unwind(tdbb, true);
}
// Lock requires an upward conversion. Sigh. Try to get the conversion.
// If it fails, release the lock and re-seize. Save the contents of the
// status vector just in case
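// If the logical lock level ever slipped below read, the cached copy may
// be stale, so the caller must re-read the page (lsLocked) even after a
// successful conversion.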
const LockState must_read = (lock->lck_logical < LCK_read) ? lsLocked : lsLockedHavePage;
if (LCK_convert_opt(tdbb, lock, lock_type))
return must_read;
if (wait == LCK_NO_WAIT)
{
bdb->release(tdbb, true);
return lsLockTimeout;
}
if (LCK_lock(tdbb, lock, lock_type, wait))
return lsLocked;
// Case: a timeout was specified, or the caller didn't want to wait, return the error.
if ((wait < 0) && (tempStatus->getErrors()[1] == isc_lock_timeout))
{
bdb->release(tdbb, false);
return lsLockTimeout;
}
// Case: lock manager detected a deadlock, probably caused by locking the
// BufferDesc's in an unfortunate order. Nothing we can do about it, return the
// error, and log it to firebird.log.
FbStatusVector* const status = tempStatus.restore();
fb_msg_format(0, JRD_BUGCHK, 215, sizeof(errmsg), errmsg,
MsgFormat::SafeArg() << bdb->bdb_page.getPageNum() << (int) page_type);
ERR_append_status(status, Arg::Gds(isc_random) << Arg::Str(errmsg));
ERR_log(JRD_BUGCHK, 215, errmsg); // msg 215 page %ld, page type %ld lock conversion denied
CCH_unwind(tdbb, true);
return lsError; // Added to get rid of Compiler Warning
}
static ULONG memory_init(thread_db* tdbb, BufferControl* bcb, SLONG number)
{
/**************************************
*
* m e m o r y _ i n i t
*
**************************************
*
* Functional description
* Initialize memory for the cache.
* Return number of buffers allocated.
*
**************************************/
SET_TDBB(tdbb);
Database* const dbb = tdbb->getDatabase();
UCHAR* memory = NULL;
SLONG buffers = 0;
const size_t page_size = dbb->dbb_page_size;
size_t memory_size = page_size * (number + 1);
fb_assert(memory_size > 0);
SLONG old_buffers = 0;
bcb_repeat* old_tail = NULL;
const UCHAR* memory_end = NULL;
bcb_repeat* tail = bcb->bcb_rpt;
// "end" is changed inside the loop
for (const bcb_repeat* end = tail + number; tail < end; tail++)
{
if (!memory)
{
// Allocate only what is required for remaining buffers.
if (memory_size > (page_size * (number + 1)))
memory_size = page_size * (number + 1);
while (true)
{
try {
memory = (UCHAR*) bcb->bcb_bufferpool->allocate(memory_size ALLOC_ARGS);
break;
}
catch (Firebird::BadAlloc&)
{
// Either there's not enough virtual memory or there is
// but it's not virtually contiguous. Let's find out by
// cutting the size in half to see if the buffers can be
// scattered over the remaining virtual address space.
memory_size >>= 1;
if (memory_size < MIN_BUFFER_SEGMENT)
{
// Diminishing returns
return buffers;
}
}
}
bcb->bcb_memory.push(memory);
memory_end = memory + memory_size;
// Allocate buffers on an address that is an even multiple
// of the page size (rather than the physical sector size). This
// is a necessary condition to support raw I/O interfaces.
memory = FB_ALIGN(memory, page_size);
old_tail = tail;
old_buffers = buffers;
}
QUE_INIT(tail->bcb_page_mod);
try
{
tail->bcb_bdb = alloc_bdb(tdbb, bcb, &memory);
}
catch (Firebird::BadAlloc&)
{
// Whoops! Time to reset our expectations. Release the buffer memory
// but use that memory size to calculate a new number that takes into account
// the page buffer overhead. Reduce this number by a 25% fudge factor to
// leave some memory for useful work.
bcb->bcb_bufferpool->deallocate(bcb->bcb_memory.pop());
memory = NULL;
for (bcb_repeat* tail2 = old_tail; tail2 < tail; tail2++)
tail2->bcb_bdb = dealloc_bdb(tail2->bcb_bdb);
number = static_cast<SLONG>(memory_size / PAGE_OVERHEAD);
number -= number >> 2;
end = old_tail + number;
tail = --old_tail; // For loop continue pops tail above
buffers = old_buffers;
continue;
}
buffers++; // Allocated buffers
number--; // Remaining buffers
// Check if memory segment has been exhausted.
if (memory + page_size > memory_end)
memory = 0;
}
return buffers;
}
static void page_validation_error(thread_db* tdbb, WIN* window, SSHORT type)
{
/**************************************
*
* p a g e _ v a l i d a t i o n _ e r r o r
*
**************************************
*
* Functional description
* We've detected a validation error on fetch. Generally
* we've detected that the type of page fetched didn't match the
* type of page we were expecting. Report an error and
* get out.
* This function will only be called rarely, as a page validation
* error is an indication of on-disk database corruption.
*
**************************************/
SET_TDBB(tdbb);
BufferDesc* bdb = window->win_bdb;
const pag* page = bdb->bdb_buffer;
PageSpace* pages =
tdbb->getDatabase()->dbb_page_manager.findPageSpace(bdb->bdb_page.getPageSpaceID());
ERR_build_status(tdbb->tdbb_status_vector,
Arg::Gds(isc_db_corrupt) << Arg::Str(pages->file->fil_string) <<
Arg::Gds(isc_page_type_err) <<
Arg::Gds(isc_badpagtyp) << Arg::Num(bdb->bdb_page.getPageNum()) <<
pagtype(type) <<
pagtype(page->pag_type));
// We should invalidate this bad buffer.
CCH_unwind(tdbb, true);
}
#ifdef CACHE_READER
static void prefetch_epilogue(Prefetch* prefetch, FbStatusVector* status_vector)
{
/**************************************
*
* p r e f e t c h _ e p i l o g u e
*
**************************************
*
* Functional description
* Stall on asynchronous I/O completion.
* Move data from prefetch buffer to database
* buffers, compute the checksum, and release
* the latch.
*
**************************************/
if (!(prefetch->prf_flags & PRF_active))
return;
thread_db* tdbb = prefetch->prf_tdbb;
Database* dbb = tdbb->getDatabase();
prefetch->prf_piob.piob_wait = TRUE;
const bool async_status = PIO_status(dbb, &prefetch->prf_piob, status_vector);
// If there was an I/O error release all buffer latches acquired for the prefetch request.
if (!async_status)
{
BufferDesc** next_bdb = prefetch->prf_bdbs;
for (USHORT i = 0; i < prefetch->prf_max_prefetch; i++)
{
if (*next_bdb)
release_bdb(tdbb, *next_bdb, true, false, false);
next_bdb++;
}
prefetch->prf_flags &= ~PRF_active;
return;
}
const SCHAR* next_buffer = prefetch->prf_io_buffer;
BufferDesc** next_bdb = prefetch->prf_bdbs;
for (USHORT i = 0; i < prefetch->prf_max_prefetch; i++)
{
if (*next_bdb)
{
pag* page = (*next_bdb)->bdb_buffer;
if (next_buffer != reinterpret_cast<char*>(page))
memcpy(page, next_buffer, (ULONG) dbb->dbb_page_size);
if (page->pag_pageno == (*next_bdb)->bdb_page.getPageNum())
{
(*next_bdb)->bdb_flags &= ~(BDB_read_pending | BDB_not_valid);
(*next_bdb)->bdb_flags |= BDB_prefetch;
}
release_bdb(tdbb, *next_bdb, true, false, false);
}
next_buffer += dbb->dbb_page_size;
next_bdb++;
}
prefetch->prf_flags &= ~PRF_active;
}
static void prefetch_init(Prefetch* prefetch, thread_db* tdbb)
{
/**************************************
*
* p r e f e t c h _ i n i t
*
**************************************
*
* Functional description
* Initialize prefetch data structure.
* Most systems that allow access to "raw" I/O
* interfaces want the buffer address aligned.
*
**************************************/
Database* dbb = tdbb->getDatabase();
prefetch->prf_tdbb = tdbb;
prefetch->prf_flags = 0;
prefetch->prf_max_prefetch = PREFETCH_MAX_TRANSFER / dbb->dbb_page_size;
prefetch->prf_aligned_buffer =
(SCHAR*) (((U_IPTR) &prefetch->prf_unaligned_buffer + MIN_PAGE_SIZE - 1) &
~((U_IPTR) MIN_PAGE_SIZE - 1));
}
static void prefetch_io(Prefetch* prefetch, FbStatusVector* status_vector)
{
/**************************************
*
* p r e f e t c h _ i o
*
**************************************
*
* Functional description
* Queue an asynchronous I/O to read
* multiple pages into prefetch buffer.
*
**************************************/
thread_db* tdbb = prefetch->prf_tdbb;
Database* dbb = tdbb->getDatabase();
if (!prefetch->prf_page_count)
prefetch->prf_flags &= ~PRF_active;
else
{
// Get the cache reader working on our behalf too
if (!(dbb->dbb_bcb->bcb_flags & BCB_reader_active))
dbb->dbb_reader_sem.post();
const bool async_status =
PIO_read_ahead(dbb, prefetch->prf_start_page, prefetch->prf_io_buffer,
prefetch->prf_page_count, &prefetch->prf_piob, status_vector);
if (!async_status)
{
BufferDesc** next_bdb = prefetch->prf_bdbs;
for (USHORT i = 0; i < prefetch->prf_max_prefetch; i++)
{
if (*next_bdb)
release_bdb(tdbb, *next_bdb, true, false, false);
next_bdb++;
}
prefetch->prf_flags &= ~PRF_active;
}
}
}
static void prefetch_prologue(Prefetch* prefetch, SLONG* start_page)
{
/**************************************
*
* p r e f e t c h _ p r o l o g u e
*
**************************************
*
* Functional description
* Search for consecutive pages to be prefetched
* and latch them for I/O.
*
**************************************/
thread_db* tdbb = prefetch->prf_tdbb;
Database* dbb = tdbb->getDatabase();
BufferControl* bcb = dbb->dbb_bcb;
prefetch->prf_start_page = *start_page;
prefetch->prf_page_count = 0;
prefetch->prf_flags |= PRF_active;
BufferDesc** next_bdb = prefetch->prf_bdbs;
for (USHORT i = 0; i < prefetch->prf_max_prefetch; i++)
{
*next_bdb = 0;
if (SBM_clear(bcb->bcb_prefetch, *start_page) &&
(*next_bdb = get_buffer(tdbb, *start_page, SYNC_SHARED, 0)))
{
if ((*next_bdb)->bdb_flags & BDB_read_pending)
prefetch->prf_page_count = i + 1;
else
{
release_bdb(tdbb, *next_bdb, true, false, false);
*next_bdb = 0;
}
}
next_bdb++;
(*start_page)++;
}
// Optimize non-sequential list prefetching to transfer directly to database buffers.
BufferDesc* bdb;
if (prefetch->prf_page_count == 1 && (bdb = prefetch->prf_bdbs[0]))
prefetch->prf_io_buffer = reinterpret_cast<char*>(bdb->bdb_buffer);
else
prefetch->prf_io_buffer = prefetch->prf_aligned_buffer;
// Reset starting page for next bitmap walk
--(*start_page);
}
#endif // CACHE_READER

static SSHORT related(BufferDesc* low, const BufferDesc* high, SSHORT limit, const ULONG mark)
{
/**************************************
*
* r e l a t e d
*
**************************************
*
* Functional description
* See if there are precedence relationships linking two buffers.
 *	Since precedence graphs can become very complex, limit the search
 *	for a precedence relationship by visiting at most a prescribed
 *	number of higher precedence blocks.
*
**************************************/
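	// Returns PRE_EXISTS if 'high' is found among the blocks that must be
	// written before 'low', PRE_UNKNOWN if the visit limit was exhausted,
	// otherwise the unused remainder of the limit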
const struct que* base = &low->bdb_higher;
for (const struct que* que_inst = base->que_forward; que_inst != base; que_inst = que_inst->que_forward)
{
if (!--limit)
return PRE_UNKNOWN;
const Precedence* precedence = BLOCK(que_inst, Precedence, pre_higher);
if (!(precedence->pre_flags & PRE_cleared))
{
if (precedence->pre_hi->bdb_prec_walk_mark == mark)
continue;
if (precedence->pre_hi == high)
return PRE_EXISTS;
if (QUE_NOT_EMPTY(precedence->pre_hi->bdb_higher))
{
limit = related(precedence->pre_hi, high, limit, mark);
if (limit == PRE_EXISTS || limit == PRE_UNKNOWN)
return limit;
}
else
precedence->pre_hi->bdb_prec_walk_mark = mark;
}
}
low->bdb_prec_walk_mark = mark;
return limit;
}

static inline bool writeable(BufferDesc* bdb)
{
/**************************************
*
* w r i t e a b l e
*
**************************************
*
* Functional description
* See if a buffer is writeable. A buffer is writeable if
 *	neither it nor any of its higher precedence cousins are
 *	marked for write.
 *	This is the starting point of a recursive walk of the precedence
 *	graph. The bdb_prec_walk_mark member is used to mark already-seen
 *	buffers and so avoid repeated walks of the same sub-graph.
 *	Currently this function can't be called from more than one
 *	thread simultaneously. When SMP is implemented, we must take
 *	additional care about thread-safety.
*
**************************************/
if (bdb->bdb_flags & BDB_marked)
return false;
if (QUE_EMPTY(bdb->bdb_higher))
return true;
BufferControl* bcb = bdb->bdb_bcb;
Sync syncPrec(&bcb->bcb_syncPrecedence, "writeable");
syncPrec.lock(SYNC_EXCLUSIVE);
const ULONG mark = get_prec_walk_mark(bcb);
return is_writeable(bdb, mark);
}

static bool is_writeable(BufferDesc* bdb, const ULONG mark)
{
/**************************************
*
* i s _ w r i t e a b l e
*
**************************************
*
* Functional description
* See if a buffer is writeable. A buffer is writeable if
 *	neither it nor any of its higher precedence cousins are
* marked for write.
*
**************************************/
// If there are buffers that must be written first, check them, too.
for (const que* queue = bdb->bdb_higher.que_forward;
queue != &bdb->bdb_higher; queue = queue->que_forward)
{
const Precedence* precedence = BLOCK(queue, Precedence, pre_higher);
if (!(precedence->pre_flags & PRE_cleared))
{
BufferDesc* high = precedence->pre_hi;
if (high->bdb_flags & BDB_marked)
return false;
if (high->bdb_prec_walk_mark != mark)
{
if (QUE_EMPTY(high->bdb_higher))
high->bdb_prec_walk_mark = mark;
else if (!is_writeable(high, mark))
return false;
}
}
}
bdb->bdb_prec_walk_mark = mark;
return true;
}

static int write_buffer(thread_db* tdbb,
BufferDesc* bdb,
const PageNumber page,
const bool write_thru,
FbStatusVector* const status, const bool write_this_page)
{
/**************************************
*
* w r i t e _ b u f f e r
*
**************************************
*
* Functional description
* Write a dirty buffer. This may recurse due to
* precedence problems.
*
* input: write_this_page
* = true if the input page needs to be written
* before returning. (normal case)
* = false if the input page is being written
* because of precedence. Only write
* one page and return so that the caller
* can re-establish the need to write this
* page.
*
* return: 0 = Write failed.
* 1 = Page is written. Page was written by this
* call, or was written by someone else, or the
* cache buffer is already reassigned.
* 2 = Only possible if write_this_page is false.
* This input page is not written. One
* page higher in precedence is written
 *	though. Probable action: re-establish the
* need to write this page and retry write.
*
**************************************/
SET_TDBB(tdbb);
#ifdef SUPERSERVER_V2
Database* const dbb = tdbb->getDatabase();
#endif
bdb->lockIO(tdbb);
if (bdb->bdb_page != page)
{
bdb->unLockIO(tdbb);
return 1;
}
if ((bdb->bdb_flags & BDB_marked) && !(bdb->bdb_flags & BDB_faked))
BUGCHECK(217); // msg 217 buffer marked for update
if (!(bdb->bdb_flags & BDB_dirty) && !(write_thru && bdb->bdb_flags & BDB_db_dirty))
{
bdb->unLockIO(tdbb);
clear_precedence(tdbb, bdb);
return 1;
}
// If there are buffers that must be written first, write them now.
	BufferControl* bcb = bdb->bdb_bcb;
if (QUE_NOT_EMPTY(bdb->bdb_higher))
{
Sync syncPrec(&bcb->bcb_syncPrecedence, "write_buffer");
while (true)
{
syncPrec.lock(SYNC_EXCLUSIVE);
if (QUE_EMPTY(bdb->bdb_higher))
{
syncPrec.unlock();
break;
}
QUE que_inst = bdb->bdb_higher.que_forward;
Precedence* precedence = BLOCK(que_inst, Precedence, pre_higher);
if (precedence->pre_flags & PRE_cleared)
{
QUE_DELETE(precedence->pre_higher);
QUE_DELETE(precedence->pre_lower);
precedence->pre_hi = (BufferDesc*) bcb->bcb_free;
bcb->bcb_free = precedence;
syncPrec.unlock();
}
else
{
bdb->unLockIO(tdbb);
BufferDesc* hi_bdb = precedence->pre_hi;
const PageNumber hi_page = hi_bdb->bdb_page;
int write_status = 0;
syncPrec.unlock();
write_status = write_buffer(tdbb, hi_bdb, hi_page, write_thru, status, false);
if (write_status == 0)
return 0; // return IO error
if (!write_this_page)
return 2; // caller wants to re-establish the need for this write after one precedence write
bdb->lockIO(tdbb);
if (bdb->bdb_page != page)
{
bdb->unLockIO(tdbb);
return 1;
}
}
}
}
#ifdef SUPERSERVER_V2
	// Header page I/O is deferred until a dirty page, modified by a
	// transaction not yet recorded on the header page, needs to be written.
	// Note that the header page must be written with the target page latched
	// to prevent younger transactions from modifying the target page.
if (page != HEADER_PAGE_NUMBER && bdb->bdb_mark_transaction > dbb->dbb_last_header_write)
{
TRA_header_write(tdbb, dbb, bdb->bdb_mark_transaction);
}
#endif
// Unless the buffer has been faked (recently re-allocated), write out the page
bool result = true;
if ((bdb->bdb_flags & BDB_dirty || (write_thru && bdb->bdb_flags & BDB_db_dirty)) &&
!(bdb->bdb_flags & BDB_marked))
{
result = write_page(tdbb, bdb, status, false);
}
bdb->unLockIO(tdbb);
if (result)
clear_precedence(tdbb, bdb);
if (!result)
return 0;
if (!write_this_page)
return 2;
return 1;
}

static bool write_page(thread_db* tdbb, BufferDesc* bdb, FbStatusVector* const status, const bool inAst)
{
/**************************************
*
* w r i t e _ p a g e
*
**************************************
*
* Functional description
* Do actions required when writing a database page,
* including journaling, shadowing.
*
**************************************/
CCH_TRACE(("WRITE %d:%06d", bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum()));
	// hvlad: why is this needed in Vulcan ???
//Sync syncWrite(&bcb->bcb_syncPageWrite, "write_page");
//syncWrite.lock(SYNC_EXCLUSIVE);
if (bdb->bdb_flags & BDB_not_valid)
{
ERR_build_status(status, Arg::Gds(isc_buf_invalid) << Arg::Num(bdb->bdb_page.getPageNum()));
return false;
}
Database* const dbb = tdbb->getDatabase();
pag* const page = bdb->bdb_buffer;
	// Before writing the db header page, make sure that
	// next_transaction is not older than the oldest active (OAT)
	// and oldest interesting (OIT) transactions
if (bdb->bdb_page == HEADER_PAGE_NUMBER)
{
const header_page* const header = (header_page*) page;
const TraNumber next_transaction = Ods::getNT(header);
const TraNumber oldest_active = Ods::getOAT(header);
const TraNumber oldest_transaction = Ods::getOIT(header);
if (next_transaction)
{
if (oldest_active > next_transaction)
BUGCHECK(266); // next transaction older than oldest active
if (oldest_transaction > next_transaction)
BUGCHECK(267); // next transaction older than oldest transaction
}
}
page->pag_generation++;
bool result = true;
//if (!dbb->dbb_wal || write_thru) becomes
//if (true || write_thru) then finally if (true)
	// I won't wipe out the if() itself to allow my changes to be verified easily by others
if (true)
{
tdbb->bumpStats(RuntimeStatistics::PAGE_WRITES);
// write out page to main database file, and to any
// shadows, making a special case of the header page
BackupManager* bm = dbb->dbb_backup_manager;
const int backup_state = bm->getState();
/// ASF: Always true: if (bdb->bdb_page.getPageNum() >= 0)
{
fb_assert(backup_state != Ods::hdr_nbak_unknown);
page->pag_pageno = bdb->bdb_page.getPageNum();
#ifdef NBAK_DEBUG
// We cannot call normal trace functions here as they are signal-unsafe
// "Write page=%d, dir=%d, diff=%d, scn=%d"
char buffer[1000], *ptr = buffer;
strcpy(ptr, "NBAK, Write page ");
ptr += strlen(ptr);
gds__ulstr(ptr, bdb->bdb_page.getPageNum(), 0, 0);
ptr += strlen(ptr);
strcpy(ptr, ", backup_state=");
ptr += strlen(ptr);
gds__ulstr(ptr, backup_state, 0, 0);
ptr += strlen(ptr);
strcpy(ptr, ", diff=");
ptr += strlen(ptr);
gds__ulstr(ptr, bdb->bdb_difference_page, 0, 0);
ptr += strlen(ptr);
strcpy(ptr, ", scn=");
ptr += strlen(ptr);
gds__ulstr(ptr, bdb->bdb_buffer->pag_scn, 0, 0);
ptr += strlen(ptr);
gds__trace(buffer);
#endif
PageSpace* pageSpace =
dbb->dbb_page_manager.findPageSpace(bdb->bdb_page.getPageSpaceID());
fb_assert(pageSpace);
const bool isTempPage = pageSpace->isTemporary();
if (!isTempPage &&
(backup_state == Ods::hdr_nbak_stalled ||
(backup_state == Ods::hdr_nbak_merge && bdb->bdb_difference_page)))
{
if (!dbb->dbb_backup_manager->writeDifference(tdbb, status,
bdb->bdb_difference_page, bdb->bdb_buffer))
{
bdb->bdb_flags |= BDB_io_error;
dbb->dbb_flags |= DBB_suspend_bgio;
return false;
}
}
if (!isTempPage && backup_state == Ods::hdr_nbak_stalled)
{
// We finished. Adjust transaction accounting and get ready for exit
if (bdb->bdb_page == HEADER_PAGE_NUMBER)
dbb->dbb_last_header_write = Ods::getNT((header_page*) page);
}
else
{
// We need to write our pages to main database files
class Pio : public CryptoManager::IOCallback
{
public:
Pio(jrd_file* f, BufferDesc* b, bool ast, bool tp, PageSpace* ps)
: file(f), bdb(b), inAst(ast), isTempPage(tp), pageSpace(ps)
{ }
bool callback(thread_db* tdbb, FbStatusVector* status, Ods::pag* page)
{
Database* dbb = tdbb->getDatabase();
while (!PIO_write(tdbb, file, bdb, page, status))
{
if (isTempPage || !CCH_rollover_to_shadow(tdbb, dbb, file, inAst))
{
bdb->bdb_flags |= BDB_io_error;
dbb->dbb_flags |= DBB_suspend_bgio;
return false;
}
file = pageSpace->file;
}
if (bdb->bdb_page == HEADER_PAGE_NUMBER)
dbb->dbb_last_header_write = Ods::getNT((header_page*) page);
if (dbb->dbb_shadow && !isTempPage)
return CCH_write_all_shadows(tdbb, 0, bdb, page, status, inAst);
return true;
}
private:
jrd_file* file;
BufferDesc* bdb;
bool inAst;
bool isTempPage;
PageSpace* pageSpace;
};
Pio io(pageSpace->file, bdb, inAst, isTempPage, pageSpace);
result = dbb->dbb_crypto_manager->write(tdbb, status, page, &io);
if (!result && (bdb->bdb_flags & BDB_io_error))
{
return false;
}
}
}
if (result)
bdb->bdb_flags &= ~BDB_db_dirty;
}
if (!result)
{
// If there was a write error then idle background threads
// so that they don't spin trying to write these pages. This
// usually results from device full errors which can be fixed
// by someone freeing disk space.
bdb->bdb_flags |= BDB_io_error;
dbb->dbb_flags |= DBB_suspend_bgio;
}
else
{
		// Clear the dirty bit vector, since the buffer is now
		// clean regardless of which transactions have modified it.
		// The destination difference page number is only valid between MARK
		// and write_page, so clear it now to avoid confusion.
bdb->bdb_difference_page = 0;
bdb->bdb_transactions = 0;
bdb->bdb_mark_transaction = 0;
if (!(bdb->bdb_bcb->bcb_flags & BCB_keep_pages))
removeDirty(bdb->bdb_bcb, bdb);
bdb->bdb_flags &= ~(BDB_must_write | BDB_system_dirty);
clear_dirty_flag_and_nbak_state(tdbb, bdb);
if (bdb->bdb_flags & BDB_io_error)
{
// If a write error has cleared, signal background threads
// to resume their regular duties. If someone has freed up
// disk space these errors will spontaneously go away.
bdb->bdb_flags &= ~BDB_io_error;
dbb->dbb_flags &= ~DBB_suspend_bgio;
}
}
return result;
}
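
// Atomically clear BDB_dirty together with BDB_nbak_state_lock. If this
// buffer held a read lock on the nbackup state (taken when the page became
// dirty), release it exactly once here.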
static void clear_dirty_flag_and_nbak_state(thread_db* tdbb, BufferDesc* bdb)
{
const AtomicCounter::counter_type oldFlags = bdb->bdb_flags.exchangeBitAnd(
~(BDB_dirty | BDB_nbak_state_lock));
if (oldFlags & BDB_nbak_state_lock)
{
NBAK_TRACE(("unlock state for dirty page %d:%06d",
bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum()));
tdbb->getDatabase()->dbb_backup_manager->unlockStateRead(tdbb);
}
else if ((oldFlags & BDB_dirty) && bdb->bdb_page != HEADER_PAGE_NUMBER)
fb_assert(PageSpace::isTemporary(bdb->bdb_page.getPageSpaceID()));
}
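
// Note a buffer as most recently used. To avoid contention on the LRU queue,
// the buffer is pushed onto the lock-free bcb_lru_chain; requeueRecentlyUsed()
// later drains that chain back into the bcb_in_use queue.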
void recentlyUsed(BufferDesc* bdb)
{
const AtomicCounter::counter_type oldFlags = bdb->bdb_flags.exchangeBitOr(BDB_lru_chained);
if (oldFlags & BDB_lru_chained)
return;
BufferControl* bcb = bdb->bdb_bcb;
#ifdef DEV_BUILD
volatile BufferDesc* chain = bcb->bcb_lru_chain;
for (; chain; chain = chain->bdb_lru_chain)
{
if (chain == bdb)
BUGCHECK(-1); // !!
}
#endif
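	// Lock-free push: link to the current chain head and retry the CAS until it succeeds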
for (;;)
{
bdb->bdb_lru_chain = bcb->bcb_lru_chain;
if (bcb->bcb_lru_chain.compareExchange(bdb->bdb_lru_chain, bdb))
break;
}
}
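
// Drain the pending LRU chain: atomically detach it, reverse it to restore
// access order, then re-insert each buffer at the head of the bcb_in_use
// queue and clear its BDB_lru_chained flag.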
void requeueRecentlyUsed(BufferControl* bcb)
{
volatile BufferDesc* chain = NULL;
// Let's pick up the LRU pending chain, if any
for (;;)
{
chain = bcb->bcb_lru_chain;
if (bcb->bcb_lru_chain.compareExchange((BufferDesc*) chain, NULL))
break;
}
if (!chain)
return;
// Next, let's flip the order
BufferDesc* reversed = NULL;
BufferDesc* bdb;
while ( (bdb = (BufferDesc*) chain) )
{
chain = bdb->bdb_lru_chain;
bdb->bdb_lru_chain = reversed;
reversed = bdb;
}
while ( (bdb = reversed) )
{
reversed = bdb->bdb_lru_chain;
QUE_DELETE (bdb->bdb_in_use);
QUE_INSERT (bcb->bcb_in_use, bdb->bdb_in_use);
bdb->bdb_flags &= ~BDB_lru_chained;
bdb->bdb_lru_chain = NULL;
}
chain = bcb->bcb_lru_chain;
}
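
// The cache gets its own pool whose statistics are redirected to the
// bcb_memory_stats group, so cache memory is accounted for separately.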
BufferControl* BufferControl::create(Database* dbb)
{
MemoryPool* const pool = dbb->createPool();
BufferControl* const bcb = FB_NEW_POOL(*pool) BufferControl(*pool, dbb->dbb_memory_stats);
pool->setStatsGroup(bcb->bcb_memory_stats);
return bcb;
}
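
// Detach the pool's statistics from the bcb before deleting it: the stats
// group lives inside the bcb, which is freed before its pool.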
void BufferControl::destroy(BufferControl* bcb)
{
Database* const dbb = bcb->bcb_database;
MemoryPool* const pool = bcb->bcb_bufferpool;
MemoryStats temp_stats;
pool->setStatsGroup(temp_stats);
delete bcb;
dbb->deletePool(pool);
}
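
// Acquire the page latch in the given mode. wait == 1 blocks indefinitely;
// any other value is treated as a timeout of -wait seconds (passed to the
// latch in milliseconds).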
bool BufferDesc::addRef(thread_db* tdbb, SyncType syncType, int wait)
{
if (wait == 1)
bdb_syncPage.lock(NULL, syncType, FB_FUNCTION);
else if (!bdb_syncPage.lock(NULL, syncType, FB_FUNCTION, -wait * 1000))
return false;
++bdb_use_count;
if (syncType == SYNC_EXCLUSIVE)
{
bdb_exclusive = tdbb;
++bdb_writers;
}
tdbb->registerBdb(this);
return true;
}
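
// Try to acquire the page latch without waiting; returns false if the latch
// cannot be granted immediately.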
bool BufferDesc::addRefConditional(thread_db* tdbb, SyncType syncType)
{
if (!bdb_syncPage.lockConditional(syncType, FB_FUNCTION))
return false;
++bdb_use_count;
if (syncType == SYNC_EXCLUSIVE)
{
bdb_exclusive = tdbb;
++bdb_writers;
}
tdbb->registerBdb(this);
return true;
}
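
// Downgrade the page latch from exclusive to shared. Only the single
// exclusive holder may downgrade; downgrading to shared when no writer
// holds the latch is a no-op.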
void BufferDesc::downgrade(SyncType syncType)
{
if (syncType == SYNC_SHARED && !bdb_writers)
return;
if (bdb_writers != 1)
BUGCHECK(296); // inconsistent latch downgrade call
--bdb_writers;
bdb_exclusive = NULL;
bdb_syncPage.downgrade(syncType);
}
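
// Release one reference to the page latch. If requested, re-post the page
// lock when some other attachment is blocked on this page (BDB_blocking).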
void BufferDesc::release(thread_db* tdbb, bool repost)
{
//const SyncType oldState = bdb_syncPage.getState(); Unfinished logic here???
fb_assert(!(bdb_flags & BDB_marked) || bdb_writers > 1);
if (!tdbb->clearBdb(this))
return;
--bdb_use_count;
if (bdb_writers)
{
if (--bdb_writers == 0)
bdb_exclusive = NULL;
bdb_syncPage.unlock(NULL, SYNC_EXCLUSIVE);
}
else
bdb_syncPage.unlock(NULL, SYNC_SHARED);
if (repost && !isLocked() && (bdb_ast_flags & BDB_blocking))
{
PAGE_LOCK_RE_POST(tdbb, bdb_bcb, bdb_lock);
}
}
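
// Serialize physical I/O on this buffer: acquire the exclusive I/O latch and
// record the owning thread in bdb_io.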
void BufferDesc::lockIO(thread_db* tdbb)
{
bdb_syncIO.lock(NULL, SYNC_EXCLUSIVE, FB_FUNCTION);
	fb_assert((!bdb_io_locks && bdb_io != tdbb) || (bdb_io_locks && bdb_io == tdbb));
bdb_io = tdbb;
bdb_io->registerBdb(this);
++bdb_io_locks;
++bdb_use_count;
}
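
// Release the I/O latch taken by lockIO(); bdb_io is cleared when the
// owner's last I/O lock is released.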
void BufferDesc::unLockIO(thread_db* tdbb)
{
fb_assert(bdb_io && bdb_io == tdbb);
fb_assert(bdb_io_locks > 0);
if (!bdb_io->clearBdb(this))
return;
--bdb_use_count;
if (--bdb_io_locks == 0)
bdb_io = NULL;
bdb_syncIO.unlock(NULL, SYNC_EXCLUSIVE);
}