/*
 *	PROGRAM:	JRD Access Method
 *	MODULE:		cch.cpp
 *	DESCRIPTION:	Disk cache manager
 *
 * The contents of this file are subject to the Interbase Public
 * License Version 1.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy
 * of the License at http://www.Inprise.com/IPL.html
 *
 * Software distributed under the License is distributed on an
 * "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code was created by Inprise Corporation
 * and its predecessors. Portions created by Inprise Corporation are
 * Copyright (C) Inprise Corporation.
 *
 * All Rights Reserved.
 * Contributor(s): ______________________________________.
 * 2001.07.06 Sean Leyne - Code Cleanup, removed "#ifdef READONLY_DATABASE"
 *                         conditionals, as the engine now fully supports
 *                         readonly databases.
 *
 * 2002.10.29 Sean Leyne - Removed obsolete "Netware" port
 *
 */
#include "firebird.h"
#include <string.h>
#include <time.h>
#include "../jrd/jrd.h"
#include "../jrd/que.h"
#include "../jrd/lck.h"
#include "../jrd/ods.h"
#include "../jrd/os/pio.h"
#include "../jrd/cch.h"
#include "gen/iberror.h"
#include "../jrd/lls.h"
#include "../jrd/sdw.h"
#include "../jrd/tra.h"
#include "../jrd/sbm.h"
#include "../jrd/nbak.h"
#include "../common/gdsassert.h"
#include "../jrd/cch_proto.h"
#include "../jrd/err_proto.h"
#include "../yvalve/gds_proto.h"
#include "../common/isc_proto.h"
#include "../common/isc_s_proto.h"
#include "../jrd/jrd_proto.h"
#include "../jrd/lck_proto.h"
#include "../jrd/pag_proto.h"
#include "../jrd/ods_proto.h"
#include "../jrd/os/pio_proto.h"
#include "../jrd/sdw_proto.h"
#include "../jrd/shut_proto.h"
#include "../common/ThreadStart.h"
#include "../jrd/tra_proto.h"
#include "../common/config/config.h"
#include "../common/classes/ClumpletWriter.h"
#include "../common/classes/MsgPrint.h"
#include "../jrd/CryptoManager.h"
#include "../common/utils_proto.h"
using namespace Jrd;
using namespace Ods;
using namespace Firebird;
// In the superserver mode, no page locks are acquired through the lock manager.
// Instead, a latching mechanism is used. So the calls to the lock subsystem for
// database pages in the original code should not be made, lest they cause
// undesirable side-effects. The following defines help us achieve that.
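// In shared-cache mode the buffer control block is flagged BCB_exclusive, and the
// three helpers below reduce page-lock traffic to no-ops: they call into the lock
// manager only when the cache is not exclusive.
//
// The page-level API these primitives support follows the usual fetch/release
// pattern (an illustrative sketch only, not a call site in this module):
//
//		WIN window(DB_PAGE_SPACE, pageNumber);
//		pag* page = CCH_FETCH(tdbb, &window, LCK_read, pag_data);
//		// ... examine the page image ...
//		CCH_RELEASE(tdbb, &window);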
#ifdef CCH_DEBUG
#include <stdarg.h>
IMPLEMENT_TRACE_ROUTINE(cch_trace, "CCH")
#endif
#ifdef SUPERSERVER_V2
#define CACHE_READER
#endif
static inline void PAGE_LOCK_RELEASE(thread_db* tdbb, BufferControl* bcb, Lock* lock) { if (!(bcb->bcb_flags & BCB_exclusive)) { CCH_TRACE(("LCK RLS %" SQUADFORMAT, lock->getKey())); LCK_release(tdbb, lock); } }
static inline void PAGE_LOCK_ASSERT(thread_db* tdbb, BufferControl* bcb, Lock* lock) { if (!(bcb->bcb_flags & BCB_exclusive)) LCK_assert(tdbb, lock); }
static inline void PAGE_LOCK_RE_POST(thread_db* tdbb, BufferControl* bcb, Lock* lock) { if (!(bcb->bcb_flags & BCB_exclusive)) { CCH_TRACE(("LCK REP %" SQUADFORMAT, lock->getKey())); LCK_re_post(tdbb, lock); } }
#define PAGE_OVERHEAD (sizeof(bcb_repeat) + sizeof(BufferDesc) + sizeof(Lock) + (int) bcb->bcb_page_size)
enum LatchState { lsOk, lsTimeout, lsPageChanged };
static void adjust_scan_count(WIN* window, bool mustRead); static BufferDesc* alloc_bdb(thread_db*, BufferControl*, UCHAR **); static Lock* alloc_page_lock(Jrd::thread_db*, BufferDesc*); static int blocking_ast_bdb(void*);
#ifdef CACHE_READER
static void prefetch_epilogue(Prefetch*, FbStatusVector *); static void prefetch_init(Prefetch*, thread_db*); static void prefetch_io(Prefetch*, FbStatusVector *); static void prefetch_prologue(Prefetch*, SLONG *);
#endif
static void check_precedence(thread_db*, WIN*, PageNumber); static void clear_precedence(thread_db*, BufferDesc*); static BufferDesc* dealloc_bdb(BufferDesc*); static void down_grade(thread_db*, BufferDesc*, int high = 0); static bool expand_buffers(thread_db*, ULONG); static BufferDesc* find_buffer(BufferControl* bcb, const PageNumber page, bool findPending); static BufferDesc* get_buffer(thread_db*, const PageNumber, SyncType, int); static int get_related(BufferDesc*, PagesArray&, int, const ULONG); static ULONG get_prec_walk_mark(BufferControl*); static LatchState latch_buffer(thread_db*, Sync&, BufferDesc*, const PageNumber, SyncType, int); static LockState lock_buffer(thread_db*, BufferDesc*, const SSHORT, const SCHAR); static ULONG memory_init(thread_db*, BufferControl*, SLONG); static void page_validation_error(thread_db*, win*, SSHORT); static SSHORT related(BufferDesc*, const BufferDesc*, SSHORT, const ULONG); static bool writeable(BufferDesc*); static bool is_writeable(BufferDesc*, const ULONG); static int write_buffer(thread_db*, BufferDesc*, const PageNumber, const bool, FbStatusVector* const, const bool); static bool write_page(thread_db*, BufferDesc*, FbStatusVector* const, const bool); static bool set_diff_page(thread_db*, BufferDesc*); static void clear_dirty_flag_and_nbak_state(thread_db*, BufferDesc*);
// Put the bdb on the dirty queue if it is not already there. The unsynchronized
// pre-check skips taking bcb_syncDirtyBdbs for buffers already queued; it is
// repeated under the sync to close the race with a concurrent inserter.
static inline void insertDirty(BufferControl* bcb, BufferDesc* bdb) { if (bdb->bdb_dirty.que_forward != &bdb->bdb_dirty) return; Sync dirtySync(&bcb->bcb_syncDirtyBdbs, "insertDirty"); dirtySync.lock(SYNC_EXCLUSIVE); if (bdb->bdb_dirty.que_forward != &bdb->bdb_dirty) return; bcb->bcb_dirty_count++; QUE_INSERT(bcb->bcb_dirty, bdb->bdb_dirty); }
// Mirror of insertDirty: the same double-checked pattern, this time for removal.
static inline void removeDirty(BufferControl* bcb, BufferDesc* bdb) { if (bdb->bdb_dirty.que_forward == &bdb->bdb_dirty) return; Sync dirtySync(&bcb->bcb_syncDirtyBdbs, "removeDirty"); dirtySync.lock(SYNC_EXCLUSIVE); if (bdb->bdb_dirty.que_forward == &bdb->bdb_dirty) return; fb_assert(bcb->bcb_dirty_count > 0); bcb->bcb_dirty_count--; QUE_DELETE(bdb->bdb_dirty); QUE_INIT(bdb->bdb_dirty); }
static void flushDirty(thread_db* tdbb, SLONG transaction_mask, const bool sys_only); static void flushAll(thread_db* tdbb, USHORT flush_flag);
static void flushPages(thread_db* tdbb, USHORT flush_flag, BufferDesc** begin, FB_SIZE_T count); static void recentlyUsed(BufferDesc* bdb); static void requeueRecentlyUsed(BufferControl* bcb);
const ULONG MIN_BUFFER_SEGMENT = 65536;
// Given a pointer to a field in the block, find the block
#define BLOCK(fld_ptr, type, fld) (type*)((SCHAR*) fld_ptr - offsetof(type, fld))
const PageNumber FREE_PAGE(DB_PAGE_SPACE, -1); const int PRE_SEARCH_LIMIT = 256; const int PRE_EXISTS = -1; const int PRE_UNKNOWN = -2;
int CCH_down_grade_dbb(void* ast_object) { /************************************** * * C C H _ d o w n _ g r a d e _ d b b * ************************************** * * Functional description * Down grade the lock on the database in response to a blocking * AST. * **************************************/
Database* dbb = static_cast<Database*>(ast_object);
try { Lock* const lock = dbb->dbb_lock; AsyncContextHolder tdbb(dbb, FB_FUNCTION); SyncLockGuard dsGuard(&dbb->dbb_sync, SYNC_EXCLUSIVE, "CCH_down_grade_dbb"); dbb->dbb_ast_flags |= DBB_blocking;
// Process the database shutdown request, if any
if (SHUT_blocking_ast(tdbb, true)) return 0;
// If we are already shared, there is nothing more we can do.
// In any case, the other guy probably wants exclusive access,
// and we can't give it anyway
if ((lock->lck_logical == LCK_SW) || (lock->lck_logical == LCK_SR)) {
// Fake conversion to the same level as we already own.
// This trick re-enables the AST delivery.
LCK_convert(tdbb, lock, lock->lck_logical, LCK_NO_WAIT); return 0; }
if (dbb->dbb_flags & DBB_bugcheck) { LCK_convert(tdbb, lock, LCK_SW, LCK_WAIT); dbb->dbb_ast_flags &= ~DBB_blocking; return 0; }
// If we are supposed to be exclusive, stay exclusive
if ((dbb->dbb_flags & DBB_exclusive) || (dbb->dbb_ast_flags & DBB_shutdown_single)) return 0;
// Assert any page locks that have been requested, but not asserted
dbb->dbb_ast_flags |= DBB_assert_locks; BufferControl* bcb = dbb->dbb_bcb;
if (bcb) { SyncLockGuard bcbSync(&bcb->bcb_syncObject, SYNC_EXCLUSIVE, "CCH_down_grade_dbb"); bcb->bcb_flags &= ~BCB_exclusive; bool done = (bcb->bcb_count == 0); while (!done) { done = true; const bcb_repeat* const head = bcb->bcb_rpt; const bcb_repeat* tail = bcb->bcb_rpt;
fb_assert(tail); // once I've got here with NULL. AP.
for (const bcb_repeat* const end = tail + bcb->bcb_count; tail < end; ++tail) { BufferDesc* bdb = tail->bcb_bdb;
// Acquire EX latch to avoid races with LCK_release (called by CCH_release)
// or LCK_lock (by lock_buffer) in main thread. Take extra care to avoid
// deadlock with CCH_handoff. See CORE-5436.
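// The conditional latch attempt below, with bcb_syncObject released while we
// sleep, avoids the lock-order inversion: this thread holds bcb_syncObject and
// wants bdb_syncPage, while a concurrent CCH_handoff can hold bdb_syncPage and
// want bcb_syncObject.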
Sync sync(&bdb->bdb_syncPage, FB_FUNCTION); while (!sync.lockConditional(SYNC_EXCLUSIVE)) { SyncUnlockGuard bcbUnlock(bcbSync); Thread::sleep(1); } if (head != bcb->bcb_rpt) { // expand_buffers or CCH_fini was called, consider to start all over again done = (bcb->bcb_count == 0); break; } PAGE_LOCK_ASSERT(tdbb, bcb, bdb->bdb_lock); } } } // Down grade the lock on the database itself if (lock->lck_physical == LCK_EX) LCK_convert(tdbb, lock, LCK_PW, LCK_WAIT); // This lets waiting cache manager in first else if (lock->lck_physical == LCK_PW) LCK_convert(tdbb, lock, LCK_SW, LCK_WAIT); else fb_assert(lock->lck_physical == 0); dbb->dbb_ast_flags &= ~DBB_blocking; } catch (const Firebird::Exception&) {} // no-op return 0; } bool CCH_exclusive(thread_db* tdbb, USHORT level, SSHORT wait_flag, Firebird::Sync* guard) { /************************************** * * C C H _ e x c l u s i v e * ************************************** * * Functional description * Get exclusive access to a database. If we get it, return true. * If the wait flag is FALSE, and we can't get it, give up and * return false. There are two levels of database exclusivity: LCK_PW * guarantees there are no normal users in the database while LCK_EX * additionally guarantees background database processes like the * shared cache manager have detached. * **************************************/ SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); if (dbb->dbb_flags & DBB_shared) { if (!CCH_exclusive_attachment(tdbb, level, wait_flag, guard)) return false; } Lock* lock = dbb->dbb_lock; if (!lock) return false; dbb->dbb_flags |= DBB_exclusive; switch (level) { case LCK_PW: if (lock->lck_physical >= LCK_PW || LCK_convert(tdbb, lock, LCK_PW, wait_flag)) return true; break; case LCK_EX: if (lock->lck_physical == LCK_EX || LCK_convert(tdbb, lock, LCK_EX, wait_flag)) return true; break; default: break; } // Clear the status vector, as our callers check the return value // and throw custom exceptions themselves fb_utils::init_status(tdbb->tdbb_status_vector); // If we are supposed to wait (presumably patiently), // but can't get the lock, generate an error if (wait_flag == LCK_WAIT) ERR_post(Arg::Gds(isc_deadlock)); dbb->dbb_flags &= ~DBB_exclusive; return false; } bool CCH_exclusive_attachment(thread_db* tdbb, USHORT level, SSHORT wait_flag, Sync* exGuard) { /************************************** * * C C H _ e x c l u s i v e _ a t t a c h m e n t * ************************************** * * Functional description * Get exclusive access to a database. If we get it, return true. * If the wait flag is FALSE, and we can't get it, give up and * return false. * **************************************/ const int CCH_EXCLUSIVE_RETRY_INTERVAL = 1; // retry interval in seconds SET_TDBB(tdbb); Database* const dbb = tdbb->getDatabase(); Sync dsGuard(&dbb->dbb_sync, FB_FUNCTION); const bool exLock = dbb->dbb_sync.ourExclusiveLock(); if (!exLock) dsGuard.lock(level != LCK_none ? SYNC_EXCLUSIVE : SYNC_SHARED); else fb_assert(exGuard); Jrd::Attachment* const attachment = tdbb->getAttachment(); if (attachment->att_flags & ATT_exclusive) return true; attachment->att_flags |= (level == LCK_none) ? ATT_attach_pending : ATT_exclusive_pending; const SLONG timeout = (wait_flag == LCK_WAIT) ? 1L << 30 : -wait_flag; // If requesting exclusive database access, then re-position attachment as the // youngest so that pending attachments may pass. 
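// dbb_attachments is a newest-first list: unlinking ourselves and re-inserting
// at the head (below) makes this attachment the youngest, so the wait loop that
// follows only needs to examine older attachments reachable through att_next.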
if (level != LCK_none) { for (Jrd::Attachment** ptr = &dbb->dbb_attachments; *ptr; ptr = &(*ptr)->att_next) { if (*ptr == attachment) { *ptr = attachment->att_next; break; } } attachment->att_next = dbb->dbb_attachments; dbb->dbb_attachments = attachment; if (!exLock) dsGuard.downgrade(SYNC_SHARED); }
for (SLONG remaining = timeout; remaining >= 0; remaining -= CCH_EXCLUSIVE_RETRY_INTERVAL) { try { tdbb->checkCancelState(true); bool found = false; for (Jrd::Attachment* other_attachment = attachment->att_next; other_attachment; other_attachment = other_attachment->att_next) { if (level == LCK_none) {
// Wait for other attachments requesting exclusive access
if (other_attachment->att_flags & (ATT_exclusive | ATT_exclusive_pending)) { found = true; break; }
// Forbid multiple attachments in single-user maintenance mode
if (other_attachment != attachment && (dbb->dbb_ast_flags & DBB_shutdown_single)) { found = true; break; } } else {
// Requesting exclusive database access
found = true; if (other_attachment->att_flags & ATT_exclusive_pending) { if (wait_flag == LCK_WAIT) ERR_post(Arg::Gds(isc_deadlock)); attachment->att_flags &= ~ATT_exclusive_pending; return false; } break; } }
if (!found) { if (level != LCK_none) attachment->att_flags |= ATT_exclusive; attachment->att_flags &= ~(ATT_exclusive_pending | ATT_attach_pending); return true; }
// Our thread needs to sleep for CCH_EXCLUSIVE_RETRY_INTERVAL seconds.
if (remaining >= CCH_EXCLUSIVE_RETRY_INTERVAL) { SyncUnlockGuard unlock(exLock ? (*exGuard) : dsGuard); Thread::sleep(CCH_EXCLUSIVE_RETRY_INTERVAL * 1000); } } // try
catch (const Exception&) { attachment->att_flags &= ~(ATT_exclusive_pending | ATT_attach_pending); throw; } }
attachment->att_flags &= ~(ATT_exclusive_pending | ATT_attach_pending); return false; }
bool CCH_expand(thread_db* tdbb, ULONG number) { /************************************** * * C C H _ e x p a n d * ************************************** * * Functional description * Expand the cache to at least a given number of buffers. If * it's already that big, don't do anything. * **************************************/
SET_TDBB(tdbb); return expand_buffers(tdbb, number); }
pag* CCH_fake(thread_db* tdbb, WIN* window, int wait) { /************************************** * * C C H _ f a k e * **************************************
 *
 * Functional description
 *	Fake a fetch to a page. Rather than reading it, however,
 *	zero it in memory. This is used when allocating a new page.
 *
 * input
 *	wait:	1 => Wait as long as necessary to get the latch.
 *		This can cause deadlocks of course.
 *		0 => If the latch can't be acquired immediately,
 *		or an IO would be necessary, then give
 *		up and return 0.
 *		<negative number> => Latch timeout interval in seconds.
 *
 * return
 *	pag pointer if successful.
 *	NULL pointer if timeout occurred (only possible if wait <> 1).
 *	NULL pointer if wait=0 and the faked page would have to be
 *		written before reuse.
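 *
 *	A typical allocation path looks like this (an illustrative sketch;
 *	the PAG-level page allocators are the real callers):
 *
 *		WIN window(pageSpaceID, newPageNumber);
 *		pag* page = CCH_fake(tdbb, &window, 1);	// zeroed and already marked
 *		// ... build the new page image ...
 *		CCH_RELEASE(tdbb, &window);
 *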
 * **************************************/
SET_TDBB(tdbb); Database* const dbb = tdbb->getDatabase(); BufferControl *bcb = dbb->dbb_bcb;
CCH_TRACE(("FAKE %d:%06d", window->win_page.getPageSpaceID(), window->win_page.getPageNum()));
// if there has been a shadow added recently, go out and
// find it before we grant any more write locks
if (dbb->dbb_ast_flags & DBB_get_shadows) SDW_get_shadows(tdbb);
BufferDesc* bdb = get_buffer(tdbb, window->win_page, SYNC_EXCLUSIVE, wait);
if (!bdb) return NULL; // latch timeout occurred
// If a dirty orphaned page is being reused - better write it first
// to clear current precedences and checkpoint state. This would also
// update the bcb_free_pages field appropriately.
if (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) {
// If the caller didn't want to wait at all, then
// return 'try to fake another page' to the caller.
if (!wait) { bdb->release(tdbb, true); return NULL; }
if (!write_buffer(tdbb, bdb, bdb->bdb_page, true, tdbb->tdbb_status_vector, true)) { CCH_unwind(tdbb, true); } } else if (QUE_NOT_EMPTY(bdb->bdb_lower)) {
// Clear residual precedence left over from AST-level I/O.
Sync syncPrec(&bcb->bcb_syncPrecedence, "CCH_fake"); syncPrec.lock(SYNC_EXCLUSIVE); clear_precedence(tdbb, bdb); }
// Here the page must not be dirty and have no backup lock owner
fb_assert((bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) == 0);
bdb->bdb_flags &= BDB_lru_chained; // yes, clear all except BDB_lru_chained
bdb->bdb_flags |= (BDB_writer | BDB_faked); bdb->bdb_scan_count = 0;
if (!(bcb->bcb_flags & BCB_exclusive)) lock_buffer(tdbb, bdb, LCK_WAIT, pag_undefined);
MOVE_CLEAR(bdb->bdb_buffer, (SLONG) dbb->dbb_page_size); window->win_buffer = bdb->bdb_buffer; window->win_bdb = bdb; window->win_flags = 0; CCH_MARK(tdbb, window); return bdb->bdb_buffer; }
pag* CCH_fetch(thread_db* tdbb, WIN* window, int lock_type, SCHAR page_type, int wait, const bool read_shadow) { /************************************** * * C C H _ f e t c h * **************************************
 *
 * Functional description
 *	Fetch a specific page. If it's already in cache,
 *	so much the better.
 *
 * input
 *	wait:	1 => Wait as long as necessary to get the latch.
 *		This can cause deadlocks of course.
 *		0 => If the latch can't be acquired immediately,
 *		give up and return 0.
 *		<negative number> => Latch timeout interval in seconds.
 *
 * return
 *	PAG if successful.
 *	NULL pointer if timeout occurred (only possible if wait <> 1).
 *
 **************************************/
SET_TDBB(tdbb); const LockState lockState = CCH_fetch_lock(tdbb, window, lock_type, wait, page_type); BufferDesc* bdb = window->win_bdb; SyncType syncType = (lock_type >= LCK_write) ?
SYNC_EXCLUSIVE : SYNC_SHARED;
switch (lockState) { case lsLocked: CCH_TRACE(("FE PAGE %d:%06d", window->win_page.getPageSpaceID(), window->win_page.getPageNum()));
CCH_fetch_page(tdbb, window, read_shadow); // must read page from disk
if (syncType != SYNC_EXCLUSIVE) bdb->downgrade(syncType); break;
case lsLatchTimeout: case lsLockTimeout:
return NULL; // latch or lock timeout
}
adjust_scan_count(window, lockState == lsLocked);
// Validate the fetched page matches the expected type
if (bdb->bdb_buffer->pag_type != page_type && page_type != pag_undefined) page_validation_error(tdbb, window, page_type);
return window->win_buffer; }
LockState CCH_fetch_lock(thread_db* tdbb, WIN* window, int lock_type, int wait, SCHAR page_type) { /************************************** * * C C H _ f e t c h _ l o c k * **************************************
 *
 * Functional description
 *	Fetch a latch and lock for a specific page.
 *
 * input
 *
 *	wait:
 *	LCK_WAIT (1)	=> Wait as long as necessary to get the lock.
 *		This can cause deadlocks of course.
 *
 *	LCK_NO_WAIT (0)	=>
 *		If the latch can't be acquired immediately, give up and return -2.
 *		If the lock can't be acquired immediately, give up and return -1.
 *
 *	<negative number>	=> Lock (latch) timeout interval in seconds.
 *
 * return
 *	0: fetch & lock were successful, page doesn't need to be read.
 *	1: fetch & lock were successful, page needs to be read from disk.
 *	-1: lock timed out, fetch failed.
 *	-2: latch timed out, fetch failed, lock not attempted.
 *
 **************************************/
SET_TDBB(tdbb); Database* const dbb = tdbb->getDatabase(); BufferControl* const bcb = dbb->dbb_bcb;
// if there has been a shadow added recently, go out and
// find it before we grant any more write locks
if (dbb->dbb_ast_flags & DBB_get_shadows) SDW_get_shadows(tdbb);
// Look for the page in the cache.
BufferDesc* bdb = get_buffer(tdbb, window->win_page, ((lock_type >= LCK_write) ? SYNC_EXCLUSIVE : SYNC_SHARED), wait);
if (wait != 1 && bdb == 0) return lsLatchTimeout; // latch timeout
if (lock_type >= LCK_write) bdb->bdb_flags |= BDB_writer;
window->win_bdb = bdb; window->win_buffer = bdb->bdb_buffer;
if (bcb->bcb_flags & BCB_exclusive) return (bdb->bdb_flags & BDB_read_pending) ? lsLocked : lsLockedHavePage;
// lock_buffer returns 0 or 1 or -1.
const LockState lock_result = lock_buffer(tdbb, bdb, wait, page_type); return lock_result; }
void CCH_fetch_page(thread_db* tdbb, WIN* window, const bool read_shadow) { /************************************** * * C C H _ f e t c h _ p a g e * ************************************** * * Functional description * Fetch a specific page. If it's already in cache, * so much the better. When "compute_checksum" is 1, compute * the checksum of the page. When it is 2, compute * the checksum only when the page type is nonzero. * **************************************/
SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); BufferDesc* bdb = window->win_bdb; BufferControl* bcb = bdb->bdb_bcb; FbStatusVector* const status = tdbb->tdbb_status_vector; pag* page = bdb->bdb_buffer; bdb->bdb_incarnation = ++bcb->bcb_page_incarnation; tdbb->bumpStats(RuntimeStatistics::PAGE_READS);
PageSpace* pageSpace = dbb->dbb_page_manager.findPageSpace(bdb->bdb_page.getPageSpaceID()); fb_assert(pageSpace); jrd_file* file = pageSpace->file; const bool isTempPage = pageSpace->isTemporary();
/* We will read a page, and if there is an I/O error we will try to use the shadow file, and try reading again, for a maximum of 3 tries, before it gives up.
The read_shadow flag is set to false only in the call to FETCH_NO_SHADOW, which is only called from validate code.
read_shadow = false -> IF an I/O error occurs give up (exit the loop, clean up, and return). So the caller, validate in most cases, can know about it and attempt to remedy the situation.
read_shadow = true -> IF an I/O error occurs attempt to rollover to the shadow file. If the I/O error is persistent (more than 3 times) error out of the routine by calling CCH_unwind, and eventually punting out. */
class Pio : public CryptoManager::IOCallback { public: Pio(jrd_file* f, BufferDesc* b, bool tp, bool rs, PageSpace* ps) : file(f), bdb(b), isTempPage(tp), read_shadow(rs), pageSpace(ps) { }
bool callback(thread_db* tdbb, FbStatusVector* status, Ods::pag* page) { Database *dbb = tdbb->getDatabase(); int retryCount = 0;
while (!PIO_read(tdbb, file, bdb, page, status)) { if (isTempPage || !read_shadow) return false;
if (!CCH_rollover_to_shadow(tdbb, dbb, file, false)) return false;
if (file != pageSpace->file) file = pageSpace->file; else { if (retryCount++ == 3) { gds__log("IO error loop Unwind to avoid a hang\n"); return false; } } } return true; }
private: jrd_file* file; BufferDesc* bdb; bool isTempPage; bool read_shadow; PageSpace* pageSpace; };
BackupManager* bm = dbb->dbb_backup_manager; BackupManager::StateReadGuard stateGuard(tdbb); const int bak_state = bm->getState(); fb_assert(bak_state != Ods::hdr_nbak_unknown); ULONG diff_page = 0;
if (!isTempPage && bak_state != Ods::hdr_nbak_normal) { diff_page = bm->getPageIndex(tdbb, bdb->bdb_page.getPageNum()); NBAK_TRACE(("Reading page %d:%06d, state=%d, diff page=%d", bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum(), bak_state, diff_page)); }
// In merge mode, if we are reading past the old end of file and the page is in the
// .delta file then the actual page is maintained in the difference file. Always
// read it from there.
if (isTempPage || bak_state == Ods::hdr_nbak_normal || !diff_page) { NBAK_TRACE(("Reading page %d:%06d, state=%d, diff page=%d from DISK", bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum(), bak_state, diff_page));
// Read page from disk as normal
Pio io(file, bdb, isTempPage, read_shadow, pageSpace);
if (!dbb->dbb_crypto_manager->read(tdbb, status, page, &io)) { if (read_shadow && !isTempPage) { PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); CCH_unwind(tdbb, true); } } } else { NBAK_TRACE(("Reading page %d, state=%d, diff page=%d from DIFFERENCE", bdb->bdb_page, bak_state, diff_page));
if (!bm->readDifference(tdbb, diff_page, page)) { PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); CCH_unwind(tdbb, true); }
if (page->pag_type == 0 && page->pag_generation == 0 && page->pag_scn == 0) {
// We encountered a page which was allocated, but never written to the
// difference file. In this case we try to read the page from database. With
// this approach if the page was old we get it from DISK, and if the page
// was new IO error (EOF) or BUGCHECK (checksum error) will be the result.
// Engine is not supposed to read a page which was never written unless
// this is a merge process.
NBAK_TRACE(("Re-reading page %d, state=%d, diff page=%d from DISK", bdb->bdb_page, bak_state, diff_page)); Pio io(file, bdb, false, read_shadow, pageSpace); if (!dbb->dbb_crypto_manager->read(tdbb, status, page, &io)) { if (read_shadow) { PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); CCH_unwind(tdbb, true); } } } } bdb->bdb_flags &= ~(BDB_not_valid | BDB_read_pending); window->win_buffer = bdb->bdb_buffer; } void CCH_forget_page(thread_db* tdbb, WIN* window) { /************************************** * * C C H _ f o r g e t _ p a g e * ************************************** * * Functional description * Page was faked but can't be written on disk. Most probably because * of out of disk space. Release page buffer and others resources and * unlink page from various queues * **************************************/ SET_TDBB(tdbb); BufferDesc* bdb = window->win_bdb; Database* dbb = tdbb->getDatabase(); if (window->win_page != bdb->bdb_page || bdb->bdb_buffer->pag_type != pag_undefined) return; // buffer was reassigned or page was reused window->win_bdb = NULL; if (bdb->bdb_flags & BDB_io_error) dbb->dbb_flags &= ~DBB_suspend_bgio; clear_dirty_flag_and_nbak_state(tdbb, bdb); bdb->bdb_flags = 0; BufferControl* bcb = dbb->dbb_bcb; removeDirty(bcb, bdb); QUE_DELETE(bdb->bdb_in_use); QUE_DELETE(bdb->bdb_que); QUE_INSERT(bcb->bcb_empty, bdb->bdb_que); if (tdbb->tdbb_flags & TDBB_no_cache_unwind) bdb->release(tdbb, true); } void CCH_fini(thread_db* tdbb) { /************************************** * * C C H _ f i n i * ************************************** * * Functional description * Shut down buffer operation. * **************************************/ SET_TDBB(tdbb); Database* const dbb = tdbb->getDatabase(); BufferControl* const bcb = dbb->dbb_bcb; if (!bcb) return; bcb_repeat* tail = bcb->bcb_rpt; const bcb_repeat* const end = tail + bcb->bcb_count; for (; tail < end; tail++) { if (tail->bcb_bdb) { delete tail->bcb_bdb; tail->bcb_bdb = NULL; } } delete[] bcb->bcb_rpt; bcb->bcb_rpt = NULL; bcb->bcb_count = 0; while (bcb->bcb_memory.hasData()) bcb->bcb_bufferpool->deallocate(bcb->bcb_memory.pop()); BufferControl::destroy(bcb); dbb->dbb_bcb = NULL; } void CCH_flush(thread_db* tdbb, USHORT flush_flag, TraNumber tra_number) { /************************************** * * C C H _ f l u s h * ************************************** * * Functional description * Flush all buffers. If the release flag is set, * release all locks. * **************************************/ SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); // note that some of the code for btc_flush() // replicates code in the for loop // to minimize call overhead -- changes should be made in both places if (flush_flag & (FLUSH_TRAN | FLUSH_SYSTEM)) { const ULONG transaction_mask = tra_number ? 
1L << (tra_number & (BITS_PER_LONG - 1)) : 0; bool sys_only = false; if (!transaction_mask && (flush_flag & FLUSH_SYSTEM)) sys_only = true; #ifdef SUPERSERVER_V2 BufferControl* bcb = dbb->dbb_bcb; //if (!dbb->dbb_wal && A && B) becomes //if (true && A && B) then finally (A && B) if (!(dbb->dbb_flags & DBB_force_write) && transaction_mask) { dbb->dbb_flush_cycle |= transaction_mask; if (!(bcb->bcb_flags & BCB_writer_active)) bcb->bcb_writer_sem.release(); } else #endif flushDirty(tdbb, transaction_mask, sys_only); } else flushAll(tdbb, flush_flag); // // Check if flush needed // const int max_unflushed_writes = dbb->dbb_config->getMaxUnflushedWrites(); const time_t max_unflushed_write_time = dbb->dbb_config->getMaxUnflushedWriteTime(); bool max_num = (max_unflushed_writes >= 0); bool max_time = (max_unflushed_write_time >= 0); bool doFlush = false; PageSpace* pageSpaceID = dbb->dbb_page_manager.findPageSpace(DB_PAGE_SPACE); jrd_file* main_file = pageSpaceID->file; // Avoid flush while creating and restoring database const Jrd::Attachment* att = tdbb->getAttachment(); const bool dontFlush = (dbb->dbb_flags & DBB_creating) || (dbb->dbb_ast_flags & DBB_shutdown_single) && att && (att->att_flags & (ATT_creator | ATT_system)); if (!(main_file->fil_flags & FIL_force_write) && (max_num || max_time) && !dontFlush) { const time_t now = time(0); SyncLockGuard guard(&dbb->dbb_flush_count_mutex, SYNC_EXCLUSIVE, "CCH_flush"); // If this is the first commit set last_flushed_write to now if (!dbb->last_flushed_write) dbb->last_flushed_write = now; const bool forceFlush = (flush_flag & FLUSH_ALL); // test max_num condition and max_time condition max_num = max_num && (dbb->unflushed_writes == max_unflushed_writes); max_time = max_time && (now - dbb->last_flushed_write > max_unflushed_write_time); if (forceFlush || max_num || max_time) { doFlush = true; dbb->unflushed_writes = 0; dbb->last_flushed_write = now; } else { dbb->unflushed_writes++; } } if (doFlush) { PIO_flush(tdbb, main_file); for (Shadow* shadow = dbb->dbb_shadow; shadow; shadow = shadow->sdw_next) PIO_flush(tdbb, shadow->sdw_file); BackupManager* bm = dbb->dbb_backup_manager; if (!bm->isShutDown()) { BackupManager::StateReadGuard stateGuard(tdbb); const int backup_state = bm->getState(); if (backup_state == Ods::hdr_nbak_stalled || backup_state == Ods::hdr_nbak_merge) bm->flushDifference(tdbb); } } // take the opportunity when we know there are no pages // in cache to check that the shadow(s) have not been // scheduled for shutdown or deletion SDW_check(tdbb); } void CCH_flush_ast(thread_db* tdbb) { /************************************** * * C C H _ f l u s h _ a s t * ************************************** * * Functional description * Flush all buffers coming from database file. * Should be called from AST * **************************************/ SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb; if (bcb->bcb_flags & BCB_exclusive) CCH_flush(tdbb, FLUSH_ALL, 0); else { // Do some fancy footwork to make sure that pages are // not removed from the btc tree at AST level. Then // restore the flag to whatever it was before. 
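// dbb_bcb is deliberately re-read on each pass of the loop below, guarding
// against the cache being resized or torn down while this AST is running.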
const bool keep_pages = bcb->bcb_flags & BCB_keep_pages; bcb->bcb_flags |= BCB_keep_pages;
for (ULONG i = 0; (bcb = dbb->dbb_bcb) && i < bcb->bcb_count; i++) { BufferDesc* bdb = bcb->bcb_rpt[i].bcb_bdb; if (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) down_grade(tdbb, bdb, 1); }
if (!keep_pages) bcb->bcb_flags &= ~BCB_keep_pages; } }
bool CCH_free_page(thread_db* tdbb) { /************************************** * * C C H _ f r e e _ p a g e * ************************************** * * Functional description * Check if the cache is below its free pages * threshold and write a page on the LRU tail. * **************************************/
// Called by VIO/garbage_collector() when it is idle to
// help quench the thirst for free pages.
Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb;
if (dbb->readOnly()) return false;
BufferDesc* bdb;
if ((bcb->bcb_flags & BCB_free_pending) && (bdb = get_buffer(tdbb, FREE_PAGE, SYNC_NONE, 1))) { if (write_buffer(tdbb, bdb, bdb->bdb_page, true, tdbb->tdbb_status_vector, true)) return true; CCH_unwind(tdbb, false); }
return false; }
SLONG CCH_get_incarnation(WIN* window) { /************************************** * * C C H _ g e t _ i n c a r n a t i o n * ************************************** * * Functional description * Get page incarnation associated with buffer. * **************************************/
return window->win_bdb->bdb_incarnation; }
void CCH_get_related(thread_db* tdbb, PageNumber page, PagesArray &lowPages) { /************************************** * * C C H _ g e t _ r e l a t e d * ************************************** * * Functional description * Collect all pages, dependent on given page (i.e. all pages which must be * written after given page). To do it, walk the low part of the precedence * graph starting from the given page and put their numbers into the array. * **************************************/
Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb;
Sync bcbSync(&bcb->bcb_syncObject, "CCH_get_related"); bcbSync.lock(SYNC_SHARED); BufferDesc* bdb = find_buffer(bcb, page, false); bcbSync.unlock();
if (bdb) { Sync precSync(&bcb->bcb_syncPrecedence, "CCH_get_related"); precSync.lock(SYNC_EXCLUSIVE); const ULONG mark = get_prec_walk_mark(bcb); get_related(bdb, lowPages, PRE_SEARCH_LIMIT, mark); } }
pag* CCH_handoff(thread_db* tdbb, WIN* window, ULONG page, int lock, SCHAR page_type, int wait, const bool release_tail) { /************************************** * * C C H _ h a n d o f f * **************************************
 *
 * Functional description
 *	Follow a pointer handing off the lock. Fetch the new page
 *	before retiring the old page lock.
 *
 * input
 *	wait:	1 => Wait as long as necessary to get the latch.
 *		This can cause deadlocks of course.
 *		0 => If the latch can't be acquired immediately,
 *		give up and return 0.
 *		<negative number> => Latch timeout interval in seconds.
 *
 * return
 *	PAG if successful.
 *	0 if a latch timeout occurred (only possible if wait <> 1).
 *	The latch on the fetched page is downgraded to shared.
 *	The fetched page is unmarked.
 *
 **************************************/
// The update, if there was one, of the input page is complete.
// The cache buffer can be 'unmarked'. It is important to
// unmark before CCH_unwind is (might be) called.
SET_TDBB(tdbb);
CCH_TRACE(("HANDOFF %d:%06d->%06d, %s", window->win_page.getPageSpaceID(), window->win_page.getPageNum(), page, (lock >= LCK_write) ?
"EX" : "SH")); BufferDesc *bdb = window->win_bdb; // unmark if (bdb->bdb_writers == 1 && (bdb->bdb_flags & BDB_marked)) { bdb->bdb_flags &= ~BDB_marked; bdb->unLockIO(tdbb); } // If the 'from-page' and 'to-page' of the handoff are the // same and the latch requested is shared then downgrade it. if ((window->win_page.getPageNum() == page) && (lock == LCK_read)) { if (bdb->ourExclusiveLock()) bdb->downgrade(SYNC_SHARED); return window->win_buffer; } WIN temp = *window; window->win_page = PageNumber(window->win_page.getPageSpaceID(), page); // This prevents a deadlock with the precedence queue, as shown by // mwrite mwrite1 2 mwrite2 2 test.fdb const int wait2 = bdb->ourExclusiveLock() ? LCK_NO_WAIT : wait; LockState must_read = CCH_fetch_lock(tdbb, window, lock, wait2, page_type); if ((must_read == lsLatchTimeout || must_read == lsLockTimeout) && wait2 == LCK_NO_WAIT) { temp.win_bdb->downgrade(SYNC_SHARED); must_read = CCH_fetch_lock(tdbb, window, lock, wait, page_type); } // Latch or lock timeout, return failure. if (must_read == lsLatchTimeout || must_read == lsLockTimeout) { *window = temp; CCH_RELEASE(tdbb, window); return NULL; } if (release_tail) CCH_RELEASE_TAIL(tdbb, &temp); else CCH_RELEASE(tdbb, &temp); if (must_read != lsLockedHavePage) CCH_fetch_page(tdbb, window, true); bdb = window->win_bdb; if (lock != LCK_write && must_read != lsLockedHavePage) { if (bdb->ourExclusiveLock()) bdb->downgrade(SYNC_SHARED); } adjust_scan_count(window, must_read == lsLocked); // Validate the fetched page matches the expected type if (bdb->bdb_buffer->pag_type != page_type && page_type != pag_undefined) page_validation_error(tdbb, window, page_type); return window->win_buffer; } void CCH_init(thread_db* tdbb, ULONG number) { /************************************** * * C C H _ i n i t * ************************************** * * Functional description * Initialize the cache. Allocate buffers control block, * buffer descriptors, and actual buffers. * **************************************/ SET_TDBB(tdbb); Database* const dbb = tdbb->getDatabase(); const bool shared = (dbb->dbb_flags & DBB_shared); CCH_TRACE(("INIT %s", dbb->dbb_filename.c_str())); // Check for database-specific page buffers if (dbb->dbb_page_buffers) number = dbb->dbb_page_buffers; // Enforce page buffer cache constraints if (number < MIN_PAGE_BUFFERS) number = MIN_PAGE_BUFFERS; if (number > MAX_PAGE_BUFFERS) number = MAX_PAGE_BUFFERS; const SLONG count = number; // Allocate and initialize buffers control block BufferControl* bcb = BufferControl::create(dbb); while (true) { try { bcb->bcb_rpt = FB_NEW_POOL(*bcb->bcb_bufferpool) bcb_repeat[number]; break; } catch (const Firebird::Exception& ex) { ex.stuffException(tdbb->tdbb_status_vector); // If the buffer control block can't be allocated, memory is // very low. Recalculate the number of buffers to account for // page buffer overhead and reduce that number by a 25% fudge factor. number = (sizeof(bcb_repeat) * number) / PAGE_OVERHEAD; number -= number >> 2; if (number < MIN_PAGE_BUFFERS) ERR_post(Arg::Gds(isc_cache_too_small)); } } dbb->dbb_bcb = bcb; bcb->bcb_page_size = dbb->dbb_page_size; bcb->bcb_database = dbb; bcb->bcb_flags = shared ? 
BCB_exclusive : 0;
//bcb->bcb_flags = BCB_exclusive; // TODO detect real state using LM
QUE_INIT(bcb->bcb_in_use); QUE_INIT(bcb->bcb_dirty); bcb->bcb_dirty_count = 0; QUE_INIT(bcb->bcb_empty);
// initialization of memory is system-specific
bcb->bcb_count = memory_init(tdbb, bcb, static_cast<SLONG>(number)); bcb->bcb_free_minimum = (SSHORT) MIN(bcb->bcb_count / 4, 128);
if (bcb->bcb_count < MIN_PAGE_BUFFERS) ERR_post(Arg::Gds(isc_cache_too_small));
// Log if requested number of page buffers could not be allocated.
if (count != (SLONG) bcb->bcb_count) { gds__log("Database: %s\n\tAllocated %ld page buffers of %ld requested", tdbb->getAttachment()->att_filename.c_str(), bcb->bcb_count, count); }
if (dbb->dbb_lock->lck_logical != LCK_EX) dbb->dbb_ast_flags |= DBB_assert_locks; }
void CCH_init2(thread_db* tdbb) { Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb;
if (!(bcb->bcb_flags & BCB_exclusive) || (bcb->bcb_flags & (BCB_cache_writer | BCB_writer_start))) return;
#ifdef CACHE_READER
if (gds__thread_start(cache_reader, dbb, THREAD_high, 0, 0)) { ERR_bugcheck_msg("cannot start thread"); }
{ // scope
Database::Checkout dcoHolder(dbb, FB_FUNCTION); dbb->dbb_reader_init.enter(); }
#endif
const Attachment* att = tdbb->getAttachment();
if (!(dbb->dbb_flags & DBB_read_only) && !(att->att_flags & ATT_security_db)) {
// writer startup in progress
bcb->bcb_flags |= BCB_writer_start;
try { bcb->bcb_writer_fini.run(bcb); } catch (const Exception&) { bcb->bcb_flags &= ~BCB_writer_start; ERR_bugcheck_msg("cannot start cache writer thread"); }
bcb->bcb_writer_init.enter(); } }
void CCH_mark(thread_db* tdbb, WIN* window, bool mark_system, bool must_write) { /************************************** * * C C H _ m a r k * ************************************** * * Functional description * Mark a window as dirty. * **************************************/
BufferDesc* bdb = window->win_bdb; BLKCHK(bdb, type_bdb); SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); tdbb->bumpStats(RuntimeStatistics::PAGE_MARKS); BufferControl* bcb = dbb->dbb_bcb;
if (!(bdb->bdb_flags & BDB_writer))
BUGCHECK(208); // msg 208 page not accessed for write
CCH_TRACE(("MARK %d:%06d", window->win_page.getPageSpaceID(), window->win_page.getPageNum()));
// A LATCH_mark is needed before the BufferDesc can be marked.
// This prevents a write while the page is being modified.
if (!(bdb->bdb_flags & BDB_marked)) bdb->lockIO(tdbb);
fb_assert(bdb->ourIOLock());
// Allocate difference page (if in stalled mode) before mark page as dirty.
// It guarantees that disk space is allocated and page could be written later.
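// If the difference page cannot be allocated (the delta file may be out of
// space, for instance), failing here reports the error in the context of the
// transaction doing the marking rather than in some later background write.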
if (!set_diff_page(tdbb, bdb)) { clear_dirty_flag_and_nbak_state(tdbb, bdb); bdb->unLockIO(tdbb); CCH_unwind(tdbb, true); } fb_assert(dbb->dbb_backup_manager->getState() != Ods::hdr_nbak_unknown); bdb->bdb_incarnation = ++bcb->bcb_page_incarnation; // mark the dirty bit vector for this specific transaction, // if it exists; otherwise mark that the system transaction // has updated this page int newFlags = 0; TraNumber number; jrd_tra* transaction = tdbb->getTransaction(); if (transaction && (number = transaction->tra_number)) { if (!(tdbb->tdbb_flags & TDBB_sweeper)) { const ULONG trans_bucket = number & (BITS_PER_LONG - 1); bdb->bdb_transactions |= (1L << trans_bucket); if (number > bdb->bdb_mark_transaction) bdb->bdb_mark_transaction = number; } } else newFlags |= BDB_system_dirty; if (mark_system) newFlags |= BDB_system_dirty; /*if (bcb->bcb_flags & BCB_exclusive) */ newFlags |= BDB_db_dirty; if (must_write || dbb->dbb_backup_manager->databaseFlushInProgress()) newFlags |= BDB_must_write; bdb->bdb_flags |= newFlags; if (!(tdbb->tdbb_flags & TDBB_sweeper) || (bdb->bdb_flags & BDB_system_dirty)) insertDirty(bcb, bdb); bdb->bdb_flags |= BDB_marked | BDB_dirty; } void CCH_must_write(thread_db* tdbb, WIN* window) { /************************************** * * C C H _ m u s t _ w r i t e * ************************************** * * Functional description * Mark a window as "must write". * **************************************/ SET_TDBB(tdbb); BufferDesc* bdb = window->win_bdb; BLKCHK(bdb, type_bdb); if (!(bdb->bdb_flags & BDB_marked) || !(bdb->bdb_flags & BDB_dirty)) { BUGCHECK(208); // msg 208 page not accessed for write } bdb->bdb_flags |= BDB_must_write | BDB_dirty; fb_assert((bdb->bdb_flags & BDB_nbak_state_lock) || PageSpace::isTemporary(bdb->bdb_page.getPageSpaceID())); } void CCH_precedence(thread_db* tdbb, WIN* window, ULONG pageNum) { const USHORT pageSpaceID = pageNum > FIRST_PIP_PAGE ? window->win_page.getPageSpaceID() : DB_PAGE_SPACE; CCH_precedence(tdbb, window, PageNumber(pageSpaceID, pageNum)); } void CCH_tra_precedence(thread_db* tdbb, WIN* window, TraNumber traNum) { /* if (traNum <= tdbb->getDatabase()->dbb_last_header_write) { return; } */ check_precedence(tdbb, window, PageNumber(TRANS_PAGE_SPACE, traNum)); } void CCH_precedence(thread_db* tdbb, WIN* window, PageNumber page) { /************************************** * * C C H _ p r e c e d e n c e * ************************************** * * Functional description * Given a window accessed for write and a page number, * establish a precedence relationship such that the * specified page will always be written before the page * associated with the window. * * If the "page number" is negative, it is really a transaction * id. In this case, the precedence relationship is to the * database header page from which the transaction id was * obtained. If the header page has been written since the * transaction id was assigned, no precedence relationship * is required. 
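 *
 *	A classic careful-write example: a data page newly referenced from a
 *	pointer page must reach disk first, so the pointer page's window gets
 *	a precedence edge on the data page's number.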
 * **************************************/
// If the page is zero, the caller isn't really serious
if (page.getPageNum() == 0) return;
// no need to support precedence for temporary pages
if (page.isTemporary() || window->win_page.isTemporary()) return;
check_precedence(tdbb, window, page); }
#ifdef CACHE_READER
void CCH_prefetch(thread_db* tdbb, SLONG* pages, SSHORT count) { /************************************** * * C C H _ p r e f e t c h * ************************************** * * Functional description * Given a vector of pages, set corresponding bits * in global prefetch bitmap. Initiate an asynchronous * I/O and get the cache reader reading on our behalf * as well. * **************************************/
SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb;
if (!count || !(bcb->bcb_flags & BCB_cache_reader)) {
// Caller isn't really serious.
return; }
// Switch default pool to permanent pool for setting bits in prefetch bitmap.
Jrd::ContextPoolHolder context(tdbb, bcb->bcb_bufferpool);
// The global prefetch bitmap is the key to the I/O coalescence mechanism which dovetails
// all thread prefetch requests to minimize sequential I/O requests.
// It also enables multipage I/O by implicitly sorting page vector requests.
SLONG first_page = 0;
for (const SLONG* const end = pages + count; pages < end;) { const SLONG page = *pages++; if (page) { SBM_set(tdbb, &bcb->bcb_prefetch, page); if (!first_page) first_page = page; } }
// Not likely that the caller's page vector was empty but check anyway.
if (first_page) { prf prefetch; prefetch_init(&prefetch, tdbb); prefetch_prologue(&prefetch, &first_page); prefetch_io(&prefetch, tdbb->tdbb_status_vector); prefetch_epilogue(&prefetch, tdbb->tdbb_status_vector); } }
bool CCH_prefetch_pages(thread_db* tdbb) { /************************************** * * C C H _ p r e f e t c h _ p a g e s * ************************************** * * Functional description * Check the prefetch bitmap for a set * of pages and read them into the cache. * **************************************/
// Placeholder to be implemented when predictive prefetch is
// enabled. This is called by VIO/garbage_collector() when it
// is idle to help satisfy the prefetch demand.
return false; }
#endif // CACHE_READER
bool set_diff_page(thread_db* tdbb, BufferDesc* bdb) { Database* const dbb = tdbb->getDatabase(); BackupManager* const bm = dbb->dbb_backup_manager;
// Temporary pages don't write to delta and need no SCN
PageSpace* pageSpace = dbb->dbb_page_manager.findPageSpace(bdb->bdb_page.getPageSpaceID()); fb_assert(pageSpace);
if (pageSpace->isTemporary()) return true;
// Take backup state lock
if (!(tdbb->tdbb_flags & TDBB_backup_write_locked)) { const AtomicCounter::counter_type oldFlags = bdb->bdb_flags.exchangeBitOr(BDB_nbak_state_lock);
if (!(oldFlags & BDB_nbak_state_lock)) { NBAK_TRACE(("lock state for dirty page %d:%06d", bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum())); bm->lockStateRead(tdbb, LCK_WAIT); } } else fb_assert(bdb->bdb_page == HEADER_PAGE_NUMBER);
if (bdb->bdb_page != HEADER_PAGE_NUMBER) {
// SCN of header page is adjusted in nbak.cpp
if (bdb->bdb_buffer->pag_scn != bm->getCurrentSCN()) {
bdb->bdb_buffer->pag_scn = bm->getCurrentSCN(); // Set SCN for the page
// At PAG_set_page_scn() below we could dirty SCN page and thus acquire
// nbackup state lock recursively. Since RWLock allows it, we are safe.
// Else we should release just taken state lock before call of
// PAG_set_page_scn() and acquire it again.
If current SCN changes meanwhile // we should repeat whole process again... win window(bdb->bdb_page); window.win_bdb = bdb; window.win_buffer = bdb->bdb_buffer; PAG_set_page_scn(tdbb, &window); } fb_assert(bdb->bdb_buffer->pag_scn == bm->getCurrentSCN()); } // Determine location of the page in difference file and write destination // so BufferDesc AST handlers and write_page routine can safely use this information const int backup_state = bm->getState(); if (backup_state == Ods::hdr_nbak_normal) return true; switch (backup_state) { case Ods::hdr_nbak_stalled: bdb->bdb_difference_page = bm->getPageIndex(tdbb, bdb->bdb_page.getPageNum()); if (!bdb->bdb_difference_page) { bdb->bdb_difference_page = bm->allocateDifferencePage(tdbb, bdb->bdb_page.getPageNum()); if (!bdb->bdb_difference_page) { return false; } NBAK_TRACE(("Allocate difference page %d for database page %d", bdb->bdb_difference_page, bdb->bdb_page)); } else { NBAK_TRACE(("Map existing difference page %d to database page %d", bdb->bdb_difference_page, bdb->bdb_page)); } break; case Ods::hdr_nbak_merge: bdb->bdb_difference_page = bm->getPageIndex(tdbb, bdb->bdb_page.getPageNum()); if (bdb->bdb_difference_page) { NBAK_TRACE(("Map existing difference page %d to database page %d (write_both)", bdb->bdb_difference_page, bdb->bdb_page)); } break; } return true; } void CCH_release(thread_db* tdbb, WIN* window, const bool release_tail) { /************************************** * * C C H _ r e l e a s e * ************************************** * * Functional description * Release a window. If the release_tail * flag is true then make the buffer * least-recently-used. * **************************************/ SET_TDBB(tdbb); BufferDesc* const bdb = window->win_bdb; BLKCHK(bdb, type_bdb); BufferControl* bcb = bdb->bdb_bcb; CCH_TRACE(("RELEASE %d:%06d", window->win_page.getPageSpaceID(), window->win_page.getPageNum())); // A large sequential scan has requested that the garbage // collector garbage collect. Mark the buffer so that the // page isn't released to the LRU tail before the garbage // collector can process the page. if (window->win_flags & WIN_large_scan && window->win_flags & WIN_garbage_collect) { bdb->bdb_flags |= BDB_garbage_collect; window->win_flags &= ~WIN_garbage_collect; } const bool mustWrite = (bdb->bdb_flags & BDB_must_write) || bcb->bcb_database->dbb_backup_manager->databaseFlushInProgress(); if (bdb->bdb_writers == 1 || bdb->bdb_use_count == 1 || (bdb->bdb_writers == 0 && mustWrite)) { const bool marked = bdb->bdb_flags & BDB_marked; bdb->bdb_flags &= ~(BDB_writer | BDB_marked | BDB_faked); if (marked) bdb->unLockIO(tdbb); if (mustWrite) { // Downgrade exclusive latch to shared to allow concurrent share access // to page during I/O. bdb->downgrade(SYNC_SHARED); if (!write_buffer(tdbb, bdb, bdb->bdb_page, false, tdbb->tdbb_status_vector, true)) { insertDirty(bcb, bdb); CCH_unwind(tdbb, true); } } } if (bdb->bdb_use_count == 1) { if (bdb->bdb_flags & BDB_no_blocking_ast) { if (bdb->bdb_flags & (BDB_db_dirty | BDB_dirty)) { if (!write_buffer(tdbb, bdb, bdb->bdb_page, false, tdbb->tdbb_status_vector, true)) { // Reassert blocking AST after write failure with dummy lock convert // to same level. This will re-enable blocking AST notification. 
{ // scope ThreadStatusGuard temp_status(tdbb); LCK_convert_opt(tdbb, bdb->bdb_lock, bdb->bdb_lock->lck_logical); } CCH_unwind(tdbb, true); } } PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); bdb->bdb_flags &= ~BDB_no_blocking_ast; bdb->bdb_ast_flags &= ~BDB_blocking; } // Make buffer the least-recently-used by queueing it to the LRU tail if (release_tail) { if ((window->win_flags & WIN_large_scan && bdb->bdb_scan_count > 0 && !(--bdb->bdb_scan_count) && !(bdb->bdb_flags & BDB_garbage_collect)) || (window->win_flags & WIN_garbage_collector && bdb->bdb_flags & BDB_garbage_collect && !bdb->bdb_scan_count)) { if (window->win_flags & WIN_garbage_collector) bdb->bdb_flags &= ~BDB_garbage_collect; { // bcb_syncLRU scope Sync lruSync(&bcb->bcb_syncLRU, "CCH_release"); lruSync.lock(SYNC_EXCLUSIVE); if (bdb->bdb_flags & BDB_lru_chained) { requeueRecentlyUsed(bcb); } QUE_DELETE(bdb->bdb_in_use); QUE_APPEND(bcb->bcb_in_use, bdb->bdb_in_use); } if ((bcb->bcb_flags & BCB_cache_writer) && (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) ) { insertDirty(bcb, bdb); bcb->bcb_flags |= BCB_free_pending; if (!(bcb->bcb_flags & BCB_writer_active)) { bcb->bcb_writer_sem.release(); } } } } } bdb->release(tdbb, true); window->win_bdb = NULL; } void CCH_release_exclusive(thread_db* tdbb) { /************************************** * * C C H _ r e l e a s e _ e x c l u s i v e * ************************************** * * Functional description * Release exclusive access to database. * **************************************/ SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); dbb->dbb_flags &= ~DBB_exclusive; Jrd::Attachment* attachment = tdbb->getAttachment(); if (attachment) attachment->att_flags &= ~ATT_exclusive; if (dbb->dbb_ast_flags & DBB_blocking) LCK_re_post(tdbb, dbb->dbb_lock); } bool CCH_rollover_to_shadow(thread_db* tdbb, Database* dbb, jrd_file* file, const bool inAst) { /************************************** * * C C H _ r o l l o v e r _ t o _ s h a d o w * ************************************** * * Functional description * An I/O error has been detected on the * main database file. Roll over to use * the shadow file. * **************************************/ SET_TDBB(tdbb); // Is the shadow subsystem yet initialized if (!dbb->dbb_shadow_lock) return false; // hvlad: if there are no shadows can't rollover // this is a temporary solution to prevent 100% CPU load // in write_page in case of PIO_write failure if (!dbb->dbb_shadow) return false; // notify other process immediately to ensure all read from sdw // file instead of db file return SDW_rollover_to_shadow(tdbb, file, inAst); } void CCH_shutdown(thread_db* tdbb) { /************************************** * * C C H _ s h u t d o w n * ************************************** * * Functional description * Shutdown database physical page locks. 
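 *	This also stops the background cache reader/writer threads, flushes
 *	whatever it can, releases the per-buffer page locks and finally closes
 *	the database and shadow files.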
* **************************************/ Database* const dbb = tdbb->getDatabase(); BufferControl* const bcb = dbb->dbb_bcb; if (!bcb) return; #ifdef CACHE_READER // Shutdown the dedicated cache reader for this database if (bcb->bcb_flags & BCB_cache_reader) { bcb->bcb_flags &= ~BCB_cache_reader; dbb->dbb_reader_sem.release(); dbb->dbb_reader_fini.enter(); } #endif // Wait for cache writer startup to complete while (bcb->bcb_flags & BCB_writer_start) Thread::yield(); // Shutdown the dedicated cache writer for this database if (bcb->bcb_flags & BCB_cache_writer) { bcb->bcb_flags &= ~BCB_cache_writer; bcb->bcb_writer_sem.release(); // Wake up running thread bcb->bcb_writer_fini.waitForCompletion(); } SyncLockGuard bcbSync(&bcb->bcb_syncObject, SYNC_EXCLUSIVE, "CCH_shutdown"); // Flush and release page buffers bcb_repeat* tail = bcb->bcb_rpt; const bcb_repeat* const end = tail + bcb->bcb_count; if (tail && tail->bcb_bdb) { try { if (dbb->dbb_flags & DBB_bugcheck) LongJump::raise(); CCH_flush(tdbb, FLUSH_FINI, 0); } catch (const Exception&) { for (; tail < end; tail++) { BufferDesc* const bdb = tail->bcb_bdb; if (dbb->dbb_flags & DBB_bugcheck) { bdb->bdb_flags &= ~BDB_db_dirty; clear_dirty_flag_and_nbak_state(tdbb, bdb); } PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); } } } // close the database file and all associated shadow files dbb->dbb_page_manager.closeAll(); SDW_close(); } void CCH_unwind(thread_db* tdbb, const bool punt) { /************************************** * * C C H _ u n w i n d * ************************************** * * Functional description * Synchronously unwind cache after I/O or lock error. * **************************************/ SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); // CCH_unwind is called when any of the following occurs: // - IO error // - journaling error => obsolete // - bad page checksum => obsolete // - wrong page type // - page locking (not latching) deadlock BufferControl* bcb = dbb->dbb_bcb; if (!bcb || (tdbb->tdbb_flags & TDBB_no_cache_unwind)) { if (punt) ERR_punt(); return; } // A cache error has occurred. Scan the cache for buffers // which may be in use and release them. 
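// Only the buffers registered in tdbb_bdbs -- those this thread actually has
// latched -- need to be visited, so the scan is bounded by the thread's own
// footprint; compare the older whole-cache scan preserved in the commented-out
// block further down.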
for (FB_SIZE_T n = 0; n < tdbb->tdbb_bdbs.getCount(); ++n) { BufferDesc *bdb = tdbb->tdbb_bdbs[n];
if (bdb) { if (bdb->bdb_flags & BDB_marked)
BUGCHECK(268); // msg 268 buffer marked during cache unwind
if (bdb->ourIOLock()) { bdb->unLockIO(tdbb); } else { if (bdb->ourExclusiveLock()) bdb->bdb_flags &= ~(BDB_writer | BDB_faked | BDB_must_write); bdb->release(tdbb, true); } } }
/*** bcb_repeat* tail = bcb->bcb_rpt; for (const bcb_repeat* const end = tail + bcb->bcb_count; tail < end; tail++) { BufferDesc* bdb = tail->bcb_bdb; if (!bdb->bdb_use_count) { continue; } if (bdb->bdb_io == tdbb) { release_bdb(tdbb, bdb, true, false, false); } if (bdb->bdb_exclusive == tdbb) { if (bdb->bdb_flags & BDB_marked) { BUGCHECK(268); // msg 268 buffer marked during cache unwind } bdb->bdb_flags &= ~(BDB_writer | BDB_faked | BDB_must_write); release_bdb(tdbb, bdb, true, false, false); } // hvlad : as far as I understand thread can't hold more than two shared latches // on the same bdb, so findSharedLatch below will not be called many times SharedLatch* latch = findSharedLatch(tdbb, bdb); while (latch) { release_bdb(tdbb, bdb, true, false, false); latch = findSharedLatch(tdbb, bdb); }
#ifndef SUPERSERVER
const pag* const page = bdb->bdb_buffer; if (page->pag_type == pag_header || page->pag_type == pag_transactions) { ++bdb->bdb_use_count; clear_dirty_flag_and_nbak_state(tdbb, bdb); bdb->bdb_flags &= ~(BDB_writer | BDB_marked | BDB_faked | BDB_db_dirty); PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); --bdb->bdb_use_count; }
#endif
} ***/
tdbb->tdbb_flags |= TDBB_cache_unwound;
if (punt) ERR_punt(); }
bool CCH_validate(WIN* window) { /************************************** * * C C H _ v a l i d a t e * ************************************** * * Functional description * Give a page a quick once over looking for unhealthiness. * **************************************/
BufferDesc* bdb = window->win_bdb;
// If page is marked for write, checksum is questionable
if ((bdb->bdb_flags & (BDB_dirty | BDB_db_dirty))) return true;
return (bdb->bdb_buffer->pag_pageno == bdb->bdb_page.getPageNum()); }
bool CCH_write_all_shadows(thread_db* tdbb, Shadow* shadow, BufferDesc* bdb, Ods::pag* page, FbStatusVector* status, const bool inAst) { /************************************** * * C C H _ w r i t e _ a l l _ s h a d o w s * ************************************** * * Functional description * Compute a checksum and write a page out to all shadows * detecting failure on write. * If shadow is null, write to all shadows, otherwise only to specified * shadow. * **************************************/
SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase();
Shadow* sdw = shadow ? shadow : dbb->dbb_shadow;
if (!sdw) return true;
bool result = true; Firebird::UCharBuffer spare_buffer;
if (bdb->bdb_page == HEADER_PAGE_NUMBER) { Ods::pag* newPage = (pag*) spare_buffer.getBuffer(dbb->dbb_page_size); memcpy(newPage, page, HDR_SIZE); page = newPage; memset((UCHAR*) page + HDR_SIZE, 0, dbb->dbb_page_size - HDR_SIZE); }
page->pag_pageno = bdb->bdb_page.getPageNum();
for (; sdw; sdw = sdw->sdw_next) {
// don't bother to write to the shadow if it is no longer viable
/* Fix for bug 7925. drop_gdb fails to remove secondary file if the shadow is conditional. Reason being the header page not being correctly initialized.
The following block was not being performed for a conditional shadow since SDW_INVALID expanded to include conditional shadow -Sudesh 07/10/95 old code --> if (sdw->sdw_flags & SDW_INVALID) */ if ((sdw->sdw_flags & SDW_INVALID) && !(sdw->sdw_flags & SDW_conditional)) continue; if (bdb->bdb_page == HEADER_PAGE_NUMBER) { // fixup header for shadow file jrd_file* shadow_file = sdw->sdw_file; header_page* header = (header_page*) page; PageSpace* pageSpaceID = dbb->dbb_page_manager.findPageSpace(DB_PAGE_SPACE); const UCHAR* q = (UCHAR *) pageSpaceID->file->fil_string; header->hdr_data[0] = HDR_end; header->hdr_end = HDR_SIZE; header->hdr_next_page = 0; PAG_add_header_entry(tdbb, header, HDR_root_file_name, (USHORT) strlen((const char*) q), q); jrd_file* next_file = shadow_file->fil_next; if (next_file) { q = (UCHAR *) next_file->fil_string; const SLONG last = next_file->fil_min_page - 1; PAG_add_header_entry(tdbb, header, HDR_file, (USHORT) strlen((const char*) q), q); PAG_add_header_entry(tdbb, header, HDR_last_page, sizeof(last), (const UCHAR*) &last); } header->hdr_flags |= hdr_active_shadow; header->hdr_header.pag_pageno = bdb->bdb_page.getPageNum(); } // This condition makes sure that PIO_write is performed in case of // conditional shadow only if the page is Header page // // -Sudesh 07/10/95 if ((sdw->sdw_flags & SDW_conditional) && bdb->bdb_page != HEADER_PAGE_NUMBER) continue; // if a write failure happens on an AUTO shadow, mark the // shadow to be deleted at the next available opportunity when we // know we don't have a page fetched if (!PIO_write(tdbb, sdw->sdw_file, bdb, page, status)) { if (sdw->sdw_flags & SDW_manual) result = false; else { sdw->sdw_flags |= SDW_delete; if (!inAst && SDW_check_conditional(tdbb)) { if (SDW_lck_update(tdbb, 0)) { SDW_notify(tdbb); CCH_unwind(tdbb, false); SDW_dump_pages(tdbb); ERR_post(Arg::Gds(isc_deadlock)); } } } } // If shadow was specified, break out of loop if (shadow) { break; } } return result; } static void adjust_scan_count(WIN* window, bool mustRead) { /************************************** * * a d j u s t _ s c a n _ c o u n t * **************************************/ BufferDesc* bdb = window->win_bdb; // If a page was read or prefetched on behalf of a large scan // then load the window scan count into the buffer descriptor. // This buffer scan count is decremented by releasing a buffer // with CCH_RELEASE_TAIL. // Otherwise zero the buffer scan count to prevent the buffer // from being queued to the LRU tail. if (window->win_flags & WIN_large_scan) { if (mustRead || (bdb->bdb_flags & BDB_prefetch) || bdb->bdb_scan_count < 0) bdb->bdb_scan_count = window->win_scans; } else if (window->win_flags & WIN_garbage_collector) { if (mustRead) bdb->bdb_scan_count = -1; if (bdb->bdb_flags & BDB_garbage_collect) window->win_flags |= WIN_garbage_collect; } else if (window->win_flags & WIN_secondary) { if (mustRead) bdb->bdb_scan_count = -1; } else { bdb->bdb_scan_count = 0; if (bdb->bdb_flags & BDB_garbage_collect) bdb->bdb_flags &= ~BDB_garbage_collect; } } static BufferDesc* alloc_bdb(thread_db* tdbb, BufferControl* bcb, UCHAR** memory) { /************************************** * * a l l o c _ b d b * ************************************** * * Functional description * Allocate buffer descriptor block. 
* **************************************/ SET_TDBB(tdbb); BufferDesc* bdb = FB_NEW_POOL(*bcb->bcb_bufferpool) BufferDesc(bcb); try { bdb->bdb_lock = alloc_page_lock(tdbb, bdb); } catch (const Firebird::Exception&) { delete bdb; throw; } bdb->bdb_buffer = (pag*) *memory; *memory += bcb->bcb_page_size; QUE_INSERT(bcb->bcb_empty, bdb->bdb_que); return bdb; } static Lock* alloc_page_lock(thread_db* tdbb, BufferDesc* bdb) { /************************************** * * a l l o c _ p a g e _ l o c k * ************************************** * * Functional description * Allocate a page-type lock. * **************************************/ SET_TDBB(tdbb); Database* const dbb = tdbb->getDatabase(); BufferControl* const bcb = bdb->bdb_bcb; const USHORT lockLen = PageNumber::getLockLen(); return FB_NEW_RPT(*bcb->bcb_bufferpool, lockLen) Lock(tdbb, lockLen, LCK_bdb, bdb, blocking_ast_bdb); } static int blocking_ast_bdb(void* ast_object) { /************************************** * * b l o c k i n g _ a s t _ b d b * ************************************** * * Functional description * Blocking AST for buffer control blocks. This is called at * AST (signal) level to indicate that a lock is blocking another * process. If the BufferDesc* is in use, set a flag indicating that the * lock is blocking and return. When the BufferDesc is released at normal * level the lock will be down graded. If the BufferDesc* is not in use, * write the page if dirty then downgrade lock. Things would be * much hairier if UNIX supported asynchronous IO, but it doesn't. * WHEW! * **************************************/ // CVC: I assume we need the function call but not the variable. /*ThreadSync* const thread = */ThreadSync::getThread("blocking_ast_bdb"); BufferDesc* const bdb = static_cast<BufferDesc*>(ast_object); try { BufferControl* const bcb = bdb->bdb_bcb; fb_assert(!(bcb->bcb_flags & BCB_exclusive)); Database* const dbb = bcb->bcb_database; fb_assert(dbb); AsyncContextHolder tdbb(dbb, FB_FUNCTION); // Do some fancy footwork to make sure that pages are // not removed from the btc tree at AST level. Then // restore the flag to whatever it was before. const bool keep_pages = (bcb->bcb_flags & BCB_keep_pages) != 0; bcb->bcb_flags |= BCB_keep_pages; down_grade(tdbb, bdb); if (!keep_pages) bcb->bcb_flags &= ~BCB_keep_pages; if (tdbb->tdbb_status_vector->getState() & IStatus::STATE_ERRORS) iscDbLogStatus(dbb->dbb_filename.c_str(), tdbb->tdbb_status_vector); } catch (const Firebird::Exception&) { return -1; } // no-op return 0; } // Remove cleared precedence blocks from high precedence queue static void purgePrecedence(BufferControl* bcb, BufferDesc* bdb) { Sync precSync(&bcb->bcb_syncPrecedence, "purgePrecedence"); precSync.lock(SYNC_EXCLUSIVE); QUE que_prec = bdb->bdb_higher.que_forward, next_prec; for (; que_prec != &bdb->bdb_higher; que_prec = next_prec) { next_prec = que_prec->que_forward; Precedence* precedence = BLOCK(que_prec, Precedence, pre_higher); if (precedence->pre_flags & PRE_cleared) { QUE_DELETE(precedence->pre_higher); QUE_DELETE(precedence->pre_lower); precedence->pre_hi = (BufferDesc*) bcb->bcb_free; bcb->bcb_free = precedence; } } } // Collect pages modified by a given or the system transaction and write them to disk. // See also comments in flushPages.
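// Illustrative restatement (not engine code) of the predicate flushDirty uses
// below to pick buffers out of the dirty queue. A dirty buffer is collected
// for flushing when any of these holds:
//
//   (transaction_mask & bdb->bdb_transactions)  // touched by a masked transaction
//   (bdb->bdb_flags & BDB_system_dirty)         // carries system-transaction changes
//   (!transaction_mask && !sys_only)            // unconditional flush requested
//   !bdb->bdb_transactions                      // not bound to any transaction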
static void flushDirty(thread_db* tdbb, SLONG transaction_mask, const bool sys_only) { SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb; Firebird::HalfStaticArray<BufferDesc*, 1024> flush; { // dirtySync scope Sync dirtySync(&bcb->bcb_syncDirtyBdbs, "flushDirty"); dirtySync.lock(SYNC_EXCLUSIVE); QUE que_inst = bcb->bcb_dirty.que_forward, next; for (; que_inst != &bcb->bcb_dirty; que_inst = next) { next = que_inst->que_forward; BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_dirty); if (!(bdb->bdb_flags & BDB_dirty)) { removeDirty(bcb, bdb); continue; } if ((transaction_mask & bdb->bdb_transactions) || (bdb->bdb_flags & BDB_system_dirty) || (!transaction_mask && !sys_only) || (!bdb->bdb_transactions)) { flush.add(bdb); } } } flushPages(tdbb, FLUSH_TRAN, flush.begin(), flush.getCount()); } // Collect pages modified by garbage collector or all dirty pages or release page // locks - depending on flush_flag, and write them to disk. // See also comments in flushPages. static void flushAll(thread_db* tdbb, USHORT flush_flag) { SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb; Firebird::HalfStaticArray<BufferDesc*, 1024> flush(bcb->bcb_dirty_count); const bool all_flag = (flush_flag & FLUSH_ALL) != 0; const bool sweep_flag = (flush_flag & FLUSH_SWEEP) != 0; const bool release_flag = (flush_flag & FLUSH_RLSE) != 0; const bool write_thru = release_flag; for (ULONG i = 0; i < bcb->bcb_count; i++) { BufferDesc* bdb = bcb->bcb_rpt[i].bcb_bdb; if (bdb->bdb_flags & (BDB_db_dirty | BDB_dirty)) { if (bdb->bdb_flags & BDB_dirty) flush.add(bdb); else if (bdb->bdb_flags & BDB_db_dirty) { // pages modified by sweep/garbage collector are not in dirty list const bool dirty_list = (bdb->bdb_dirty.que_forward != &bdb->bdb_dirty); if (all_flag || (sweep_flag && !dirty_list)) flush.add(bdb); } } else if (release_flag) { bdb->addRef(tdbb, SYNC_EXCLUSIVE); if (bdb->bdb_use_count > 1) BUGCHECK(210); // msg 210 page in use during flush PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); bdb->release(tdbb, false); } } flushPages(tdbb, flush_flag, flush.begin(), flush.getCount()); } // Used in qsort below extern "C" { static int cmpBdbs(const void* a, const void* b) { const BufferDesc* bdbA = *(BufferDesc**) a; const BufferDesc* bdbB = *(BufferDesc**) b; if (bdbA->bdb_page > bdbB->bdb_page) return 1; if (bdbA->bdb_page < bdbB->bdb_page) return -1; return 0; } } // extern C // Write array of pages to disk in efficient order. // First, sort pages by their numbers to make writes physically ordered and // thus faster. At every iteration of the while loop, write pages which have no high // precedence pages to ensure the order is preserved. If after some iteration there are // no such pages (i.e. all not-yet-written pages have high precedence pages) // then write them all at the last iteration (of course write_buffer will also check // for precedence before write). static void flushPages(thread_db* tdbb, USHORT flush_flag, BufferDesc** begin, FB_SIZE_T count) { FbStatusVector* const status = tdbb->tdbb_status_vector; const bool all_flag = (flush_flag & FLUSH_ALL) != 0; const bool release_flag = (flush_flag & FLUSH_RLSE) != 0; const bool write_thru = release_flag; qsort(begin, count, sizeof(BufferDesc*), cmpBdbs); MarkIterator<BufferDesc*> iter(begin, count); FB_SIZE_T written = 0; bool writeAll = false; while (!iter.isEmpty()) { bool found = false; for (; !iter.isEof(); ++iter) { BufferDesc* bdb = *iter; fb_assert(bdb); if (!bdb) continue; bdb->addRef(tdbb, release_flag ?
SYNC_EXCLUSIVE : SYNC_SHARED); BufferControl* bcb = bdb->bdb_bcb; if (!writeAll) purgePrecedence(bcb, bdb); if (writeAll || QUE_EMPTY(bdb->bdb_higher)) { if (release_flag) { if (bdb->bdb_use_count > 1) BUGCHECK(210); // msg 210 page in use during flush } if (!all_flag || bdb->bdb_flags & (BDB_db_dirty | BDB_dirty)) { if (!write_buffer(tdbb, bdb, bdb->bdb_page, write_thru, status, true)) CCH_unwind(tdbb, true); } // release lock before losing control over bdb, it prevents // concurrent operations on released lock if (release_flag) PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); bdb->release(tdbb, !release_flag && !(bdb->bdb_flags & BDB_dirty)); iter.mark(); found = true; written++; } else { bdb->release(tdbb, false); } } if (!found) writeAll = true; iter.rewind(); } fb_assert(count == written); } #ifdef CACHE_READER void BufferControl::cache_reader(BufferControl* bcb) { /************************************** * * c a c h e _ r e a d e r * ************************************** * * Functional description * Prefetch pages into cache for sequential scans. * Use asynchronous I/O to keep two prefetch requests * busy at a time. * **************************************/ Database* dbb = bcb->bcb_database; Database::SyncGuard dsGuard(dbb); FbLocalStatus status_vector; // Establish a thread context. ThreadContextHolder tdbb(status_vector); // Dummy attachment needed for lock owner identification. tdbb->setDatabase(dbb); Jrd::Attachment* const attachment = Attachment::create(dbb); tdbb->setAttachment(attachment); attachment->att_filename = dbb->dbb_filename; Jrd::ContextPoolHolder context(tdbb, bcb->bcb_bufferpool); // This try block is specifically to protect the LCK_init call: if // LCK_init fails we won't be able to accomplish anything anyway, so // return, unlike the other try blocks further down the page. try { LCK_init(tdbb, LCK_OWNER_attachment); TRA_init(attachment); bcb = dbb->dbb_bcb; bcb->bcb_flags |= BCB_cache_reader; dbb->dbb_reader_init.release(); // Notify our creator that we have started } catch (const Firebird::Exception& ex) { ex.stuffException(&status_vector); iscDbLogStatus(dbb->dbb_filename.c_str(), &status_vector); return; } try { // Set up dual prefetch control blocks to keep multiple prefetch // requests active at a time. Also helps to prevent the cache reader // from becoming dedicated or locked into a single request's prefetch demands. prf prefetch1, prefetch2; prefetch_init(&prefetch1, tdbb); prefetch_init(&prefetch2, tdbb); while (bcb->bcb_flags & BCB_cache_reader) { bcb->bcb_flags |= BCB_reader_active; bool found = false; SLONG starting_page = -1; prf* next_prefetch = &prefetch1; if (dbb->dbb_flags & DBB_suspend_bgio) { Database::Checkout dcoHolder(dbb, FB_FUNCTION); dbb->dbb_reader_sem.tryEnter(10); continue; } // Make a pass thru the global prefetch bitmap looking for something // to read. When the entire bitmap has been scanned and found to be // empty then wait for new requests. do { if (!(prefetch1.prf_flags & PRF_active) && SBM_next(bcb->bcb_prefetch, &starting_page, RSE_get_forward)) { found = true; prefetch_prologue(&prefetch1, &starting_page); prefetch_io(&prefetch1, &status_vector); } if (!(prefetch2.prf_flags & PRF_active) && SBM_next(bcb->bcb_prefetch, &starting_page, RSE_get_forward)) { found = true; prefetch_prologue(&prefetch2, &starting_page); prefetch_io(&prefetch2, &status_vector); } prf* post_prefetch = next_prefetch; next_prefetch = (post_prefetch == &prefetch1) ?
&prefetch2 : &prefetch1; if (post_prefetch->prf_flags & PRF_active) prefetch_epilogue(post_prefetch, &status_vector); if (found) { // If the cache writer or garbage collector is idle, put // them to work prefetching pages. #ifdef CACHE_WRITER if ((bcb->bcb_flags & BCB_cache_writer) && !(bcb->bcb_flags & BCB_writer_active)) bcb->bcb_writer_sem.release(); #endif #ifdef GARBAGE_THREAD if ((dbb->dbb_flags & DBB_garbage_collector) && !(dbb->dbb_flags & DBB_gc_active)) dbb->dbb_gc_sem.release(); #endif } } while (prefetch1.prf_flags & PRF_active || prefetch2.prf_flags & PRF_active); // If there's more work to do voluntarily ask to be rescheduled. // Otherwise, wait for event notification. BufferDesc* bdb; if (found) JRD_reschedule(tdbb, 0, true); else if (bcb->bcb_flags & BCB_free_pending && (bdb = get_buffer(tdbb, FREE_PAGE, SYNC_NONE, 1))) { // In our spare time, help writer clean the cache. write_buffer(tdbb, bdb, bdb->bdb_page, true, &status_vector, true); } else { bcb->bcb_flags &= ~BCB_reader_active; Database::Checkout dcoHolder(dbb, FB_FUNCTION); dbb->dbb_reader_sem.tryEnter(10); } bcb = dbb->dbb_bcb; } LCK_fini(tdbb, LCK_OWNER_attachment); Jrd::Attachment::destroy(attachment); // no need to save warning/error strings here tdbb->setAttachment(NULL); bcb->bcb_flags &= ~BCB_cache_reader; dbb->dbb_reader_fini.release(); } // try catch (const Firebird::Exception& ex) { ex.stuffException(&status_vector); iscDbLogStatus(dbb->dbb_filename.c_str(), &status_vector); } } #endif void BufferControl::cache_writer(BufferControl* bcb) { /************************************** * * c a c h e _ w r i t e r * ************************************** * * Functional description * Write dirty pages to database to maintain an adequate supply of free pages. * **************************************/ FbLocalStatus status_vector; Database* const dbb = bcb->bcb_database; try { UserId user; user.setUserName("Cache Writer"); Jrd::Attachment* const attachment = Jrd::Attachment::create(dbb); RefPtr<SysStableAttachment> sAtt(FB_NEW SysStableAttachment(attachment)); attachment->setStable(sAtt); attachment->att_filename = dbb->dbb_filename; attachment->att_user = &user; BackgroundContextHolder tdbb(dbb, attachment, &status_vector, FB_FUNCTION); try { LCK_init(tdbb, LCK_OWNER_attachment); PAG_header(tdbb, true); PAG_attachment_id(tdbb); TRA_init(attachment); sAtt->initDone(); bcb->bcb_flags |= BCB_cache_writer; bcb->bcb_flags &= ~BCB_writer_start; // Notify our creator that we have started bcb->bcb_writer_init.release(); while (bcb->bcb_flags & BCB_cache_writer) { bcb->bcb_flags |= BCB_writer_active; #ifdef CACHE_READER SLONG starting_page = -1; #endif if (dbb->dbb_flags & DBB_suspend_bgio) { EngineCheckout cout(tdbb, FB_FUNCTION); bcb->bcb_writer_sem.tryEnter(10); continue; } #ifdef SUPERSERVER_V2 // Flush buffers for lazy commit SLONG commit_mask; if (!(dbb->dbb_flags & DBB_force_write) && (commit_mask = dbb->dbb_flush_cycle)) { dbb->dbb_flush_cycle = 0; btc_flush(tdbb, commit_mask, false, status_vector); } #endif if (bcb->bcb_flags & BCB_free_pending) { BufferDesc* const bdb = get_buffer(tdbb, FREE_PAGE, SYNC_NONE, 1); if (bdb) write_buffer(tdbb, bdb, bdb->bdb_page, true, &status_vector, true); } // If there's more work to do voluntarily ask to be rescheduled. // Otherwise, wait for event notification.
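// Each pass of the writer loop ends in one of three ways: free-page or flush
// work remains (yield via JRD_reschedule and loop again), prefetch work exists
// when CACHE_READER is compiled in (do it inline), or there is nothing to do
// (drop BCB_writer_active and block on bcb_writer_sem with a 10-second timeout).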
if ((bcb->bcb_flags & BCB_free_pending) || dbb->dbb_flush_cycle) { JRD_reschedule(tdbb, 0, true); } #ifdef CACHE_READER else if (SBM_next(bcb->bcb_prefetch, &starting_page, RSE_get_forward)) { // Prefetch some pages in our spare time and in the process // garbage collect the prefetch bitmap. prf prefetch; prefetch_init(&prefetch, tdbb); prefetch_prologue(&prefetch, &starting_page); prefetch_io(&prefetch, &status_vector); prefetch_epilogue(&prefetch, &status_vector); } #endif else { bcb->bcb_flags &= ~BCB_writer_active; EngineCheckout cout(tdbb, FB_FUNCTION); bcb->bcb_writer_sem.tryEnter(10); } } } catch (const Firebird::Exception& ex) { ex.stuffException(&status_vector); iscDbLogStatus(dbb->dbb_filename.c_str(), &status_vector); // continue execution to clean up } Monitoring::cleanupAttachment(tdbb); attachment->releaseLocks(tdbb); LCK_fini(tdbb, LCK_OWNER_attachment); attachment->releaseRelations(tdbb); } // try catch (const Firebird::Exception& ex) { bcb->exceptionHandler(ex, cache_writer); } bcb->bcb_flags &= ~BCB_cache_writer; try { if (bcb->bcb_flags & BCB_writer_start) { bcb->bcb_flags &= ~BCB_writer_start; bcb->bcb_writer_init.release(); } } catch (const Firebird::Exception& ex) { bcb->exceptionHandler(ex, cache_writer); } } void BufferControl::exceptionHandler(const Firebird::Exception& ex, BcbThreadSync::ThreadRoutine*) { FbLocalStatus status_vector; ex.stuffException(&status_vector); iscDbLogStatus(bcb_database->dbb_filename.c_str(), &status_vector); } static void check_precedence(thread_db* tdbb, WIN* window, PageNumber page) { /************************************** * * c h e c k _ p r e c e d e n c e * ************************************** * * Functional description * Given a window accessed for write and a page number, * establish a precedence relationship such that the * specified page will always be written before the page * associated with the window. * * If the "page number" is negative, it is really a transaction * id. In this case, the precedence relationship is to the * database header page from which the transaction id was * obtained. If the header page has been written since the * transaction id was assigned, no precedence relationship * is required. * **************************************/ SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb; // If this is really a transaction id, sort things out switch (page.getPageSpaceID()) { case DB_PAGE_SPACE: break; case TRANS_PAGE_SPACE: if (page.getPageNum() <= tdbb->getDatabase()->dbb_last_header_write) return; page = PageNumber(DB_PAGE_SPACE, 0); break; default: fb_assert(false); return; } // In the past, a negative value passed here meant not a page, but a transaction number. // When we finally move to 32 (not 31) bit page numbers, this should be removed, // but currently I add: fb_assert(!(page.getPageNum() & 0x80000000)); // to help detect cases when something possibly negative is passed. // AP - 2011. // Start by finding the buffer containing the high priority page Sync bcbSync(&bcb->bcb_syncObject, "check_precedence"); bcbSync.lock(SYNC_SHARED); BufferDesc* high = find_buffer(bcb, page, false); bcbSync.unlock(); if (!high) return; // Found the higher precedence buffer. If it's not dirty, don't sweat it. // If it's the same page, ditto.
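// The invariant maintained below: a precedence relationship low -> high means
// the high page must reach disk before the low page may be written. This is
// the careful-write ordering that keeps the on-disk structures consistent
// after a crash.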
if (!(high->bdb_flags & BDB_dirty) || (high->bdb_page == window->win_page)) return; BufferDesc* low = window->win_bdb; if ((low->bdb_flags & BDB_marked) && !(low->bdb_flags & BDB_faked)) BUGCHECK(212); // msg 212 CCH_precedence: block marked // If already related, there's nothing more to do. If the precedence // search was too complex to complete, just write the high page and // forget about establishing the relationship. Sync precSync(&bcb->bcb_syncPrecedence, "check_precedence"); precSync.lock(SYNC_EXCLUSIVE); if (QUE_NOT_EMPTY(high->bdb_lower)) { const ULONG mark = get_prec_walk_mark(bcb); const SSHORT relationship = related(low, high, PRE_SEARCH_LIMIT, mark); if (relationship == PRE_EXISTS) return; if (relationship == PRE_UNKNOWN) { precSync.unlock(); const PageNumber high_page = high->bdb_page; if (!write_buffer(tdbb, high, high_page, false, tdbb->tdbb_status_vector, true)) CCH_unwind(tdbb, true); return; } } // Check to see if we're going to create a cycle or the precedence search // was too complex to complete. If so, force a write of the "after" // (currently fetched) page. Assuming everyone obeys the rules and calls // precedence before marking the buffer, everything should be ok while (QUE_NOT_EMPTY(low->bdb_lower)) { const ULONG mark = get_prec_walk_mark(bcb); const SSHORT relationship = related(high, low, PRE_SEARCH_LIMIT, mark); if (relationship == PRE_EXISTS || relationship == PRE_UNKNOWN) { precSync.unlock(); const PageNumber low_page = low->bdb_page; if (!write_buffer(tdbb, low, low_page, false, tdbb->tdbb_status_vector, true)) CCH_unwind(tdbb, true); precSync.lock(SYNC_EXCLUSIVE); } else break; } // We're going to establish a new precedence relationship. Get a block, // fill in the appropriate fields, and insert it into the various ques Precedence* precedence = bcb->bcb_free; if (precedence) bcb->bcb_free = (Precedence*) precedence->pre_hi; else precedence = FB_NEW_POOL(*bcb->bcb_bufferpool) Precedence; precedence->pre_low = low; precedence->pre_hi = high; precedence->pre_flags = 0; QUE_INSERT(low->bdb_higher, precedence->pre_higher); QUE_INSERT(high->bdb_lower, precedence->pre_lower); // explicitly include high page in system transaction flush process if (low->bdb_flags & BDB_system_dirty && high->bdb_flags & BDB_dirty) high->bdb_flags |= BDB_system_dirty; } static void clear_precedence(thread_db* tdbb, BufferDesc* bdb) { /************************************** * * c l e a r _ p r e c e d e n c e * ************************************** * * Functional description * Clear precedence relationships to lower precedence block. * **************************************/ SET_TDBB(tdbb); if (QUE_EMPTY(bdb->bdb_lower)) return; BufferControl* const bcb = bdb->bdb_bcb; Sync precSync(&bcb->bcb_syncPrecedence, "clear_precedence"); if (!bcb->bcb_syncPrecedence.ourExclusiveLock()) precSync.lock(SYNC_EXCLUSIVE); // Loop thru lower precedence buffers. If any can be downgraded, // by all means down grade them.
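// Each precedence block released below goes back to bcb_free (pre_hi doubles
// as the free-list link), and a lower buffer whose lock is wanted elsewhere
// (BDB_blocking) gets its page lock re-posted so the blocked process can
// proceed now that the ordering constraint is gone.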
while (QUE_NOT_EMPTY(bdb->bdb_lower)) { QUE que_inst = bdb->bdb_lower.que_forward; Precedence* precedence = BLOCK(que_inst, Precedence, pre_lower); BufferDesc* low_bdb = precedence->pre_low; QUE_DELETE(precedence->pre_higher); QUE_DELETE(precedence->pre_lower); precedence->pre_hi = (BufferDesc*) bcb->bcb_free; bcb->bcb_free = precedence; if (!(precedence->pre_flags & PRE_cleared)) { if (low_bdb->bdb_ast_flags & BDB_blocking) PAGE_LOCK_RE_POST(tdbb, bcb, low_bdb->bdb_lock); } } } static BufferDesc* dealloc_bdb(BufferDesc* bdb) { /************************************** * * d e a l l o c _ b d b * ************************************** * * Functional description * Deallocate buffer descriptor block. * **************************************/ if (bdb) { delete bdb->bdb_lock; QUE_DELETE(bdb->bdb_que); delete bdb; } return NULL; } static void down_grade(thread_db* tdbb, BufferDesc* bdb, int high) { /************************************** * * d o w n _ g r a d e * ************************************** * * Functional description * A lock on a page is blocking another process. If possible, downgrade * the lock on the buffer. This may be called from either AST or * regular level. Return true if the down grade was successful. If the * down grade was deferred for any reason, return false. * **************************************/ SET_TDBB(tdbb); const bool oldBlocking = (bdb->bdb_ast_flags.exchangeBitOr(BDB_blocking) & BDB_blocking); Lock* lock = bdb->bdb_lock; Database* dbb = tdbb->getDatabase(); BufferControl* bcb = bdb->bdb_bcb; if (dbb->dbb_flags & DBB_bugcheck) { PAGE_LOCK_RELEASE(tdbb, bcb, lock); bdb->bdb_ast_flags &= ~BDB_blocking; clear_dirty_flag_and_nbak_state(tdbb, bdb); return; // true; } // If the BufferDesc is in use and, being written or already // downgraded to read, mark it as blocking and exit. bool justWrite = false; if (bdb->isLocked() || !bdb->addRefConditional(tdbb, SYNC_EXCLUSIVE)) { if (!high) return; // false; // hvlad: buffer is in use and we can't downgrade its lock. But if this // buffer blocks some lower precedence buffer, it is enough to just write // our (high) buffer to clear precedence and thus allow blocked (low) // buffer to be downgraded. IO lock guarantees that there is no buffer // modification in progress currently and it is safe to write it right now. // No need to mark our buffer as blocking nor to change state of our lock. bdb->lockIO(tdbb); if (!(bdb->bdb_flags & BDB_dirty)) { bdb->unLockIO(tdbb); return; // true } if (!oldBlocking) bdb->bdb_ast_flags &= ~BDB_blocking; justWrite = true; } // If the page isn't dirty, the lock can be quietly downgraded. if (!(bdb->bdb_flags & BDB_dirty)) { bdb->bdb_ast_flags &= ~BDB_blocking; LCK_downgrade(tdbb, lock); bdb->release(tdbb, false); return; // true; } bool in_use = false, invalid = false; if (bdb->bdb_flags & BDB_not_valid) invalid = true; // If there are higher precedence guys, see if they can be written. 
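// down_grade() recurses through the higher-precedence graph (the "high"
// argument tracks recursion depth): every dirty higher page must be written
// out, or shown to be in use elsewhere, before this buffer's lock may be
// surrendered.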
while (QUE_NOT_EMPTY(bdb->bdb_higher)) { // syncPrec scope Sync syncPrec(&bcb->bcb_syncPrecedence, "down_grade"); syncPrec.lock(SYNC_EXCLUSIVE); bool found = false; for (QUE que_inst = bdb->bdb_higher.que_forward; que_inst != &bdb->bdb_higher; que_inst = que_inst->que_forward) { Precedence* precedence = BLOCK(que_inst, Precedence, pre_higher); if (precedence->pre_flags & PRE_cleared) continue; if (invalid) { precedence->pre_flags |= PRE_cleared; continue; } BufferDesc* blocking_bdb = precedence->pre_hi; if (blocking_bdb->bdb_flags & BDB_dirty) { found = true; syncPrec.unlock(); down_grade(tdbb, blocking_bdb, high + 1); if ((blocking_bdb->bdb_flags & BDB_dirty) && !(precedence->pre_flags & PRE_cleared)) in_use = true; if (blocking_bdb->bdb_flags & BDB_not_valid) { invalid = true; in_use = false; que_inst = bdb->bdb_higher.que_forward; } break; } } // If any higher precedence buffer can't be written, mark this buffer as blocking and exit. if (in_use) { if (justWrite) bdb->unLockIO(tdbb); else bdb->release(tdbb, false); return; // false; } if (!found) break; } // syncPrec scope // Everything is clear to write this buffer. Do so and reduce the lock if (!justWrite) bdb->lockIO(tdbb); const bool written = !(bdb->bdb_flags & BDB_dirty) || write_page(tdbb, bdb, tdbb->tdbb_status_vector, true); if (!justWrite) bdb->unLockIO(tdbb); if (invalid || !written) { bdb->bdb_flags |= BDB_not_valid; clear_dirty_flag_and_nbak_state(tdbb, bdb); bdb->bdb_ast_flags &= ~BDB_blocking; TRA_invalidate(tdbb, bdb->bdb_transactions); bdb->bdb_transactions = 0; PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); } else if (!justWrite) { bdb->bdb_ast_flags &= ~BDB_blocking; LCK_downgrade(tdbb, lock); } // Clear precedence relationships to lower precedence buffers. Since it // isn't safe to tweak the que pointers from AST level, just mark the precedence // links as cleared. Somebody else will clean up the precedence blocks. while (QUE_NOT_EMPTY(bdb->bdb_lower)) { // syncPrec scope Sync syncPrec(&bcb->bcb_syncPrecedence, "down_grade"); syncPrec.lock(SYNC_EXCLUSIVE); bool found = false; for (QUE que_inst = bdb->bdb_lower.que_forward; que_inst != &bdb->bdb_lower; que_inst = que_inst->que_forward) { Precedence* precedence = BLOCK(que_inst, Precedence, pre_lower); if (precedence->pre_flags & PRE_cleared) continue; BufferDesc* blocking_bdb = precedence->pre_low; if (bdb->bdb_flags & BDB_not_valid) { blocking_bdb->bdb_flags |= BDB_not_valid; } precedence->pre_flags |= PRE_cleared; if ((blocking_bdb->bdb_flags & BDB_not_valid) || (blocking_bdb->bdb_ast_flags & BDB_blocking)) { found = true; syncPrec.unlock(); down_grade(tdbb, blocking_bdb); break; } } if (!found) break; } // syncPrec scope bdb->bdb_flags &= ~BDB_not_valid; if (justWrite) bdb->unLockIO(tdbb); else bdb->release(tdbb, false); return; // true; } static bool expand_buffers(thread_db* tdbb, ULONG number) { /************************************** * * e x p a n d _ b u f f e r s * ************************************** * * Functional description * Expand the cache to at least a given number of buffers. If * it's already that big, don't do anything. * * Nickolay Samofatov, 08-Mar-2004. * This function does not handle exceptions correctly, * it looks like good handling requires rewrite. 
* **************************************/ SET_TDBB(tdbb); Database* const dbb = tdbb->getDatabase(); BufferControl* const bcb = dbb->dbb_bcb; if (number <= bcb->bcb_count || number > MAX_PAGE_BUFFERS) return false; Sync syncBcb(&bcb->bcb_syncObject, "expand_buffers"); syncBcb.lock(SYNC_EXCLUSIVE); // for Win16 platform, we want to ensure that no cache buffer ever ends on a segment boundary // CVC: Is this code obsolete or only the comment? ULONG num_per_seg = number - bcb->bcb_count; ULONG left_to_do = num_per_seg; // Allocate and initialize buffers control block Jrd::ContextPoolHolder context(tdbb, bcb->bcb_bufferpool); const bcb_repeat* const old_end = bcb->bcb_rpt + bcb->bcb_count; bcb_repeat* const new_rpt = FB_NEW_POOL(*bcb->bcb_bufferpool) bcb_repeat[number]; bcb_repeat* const old_rpt = bcb->bcb_rpt; bcb->bcb_rpt = new_rpt; bcb->bcb_count = number; bcb->bcb_free_minimum = (SSHORT) MIN(number / 4, 128); /* 25% clean page reserve */ const bcb_repeat* const new_end = bcb->bcb_rpt + number; // Initialize tail of new buffer control block bcb_repeat* new_tail; for (new_tail = bcb->bcb_rpt; new_tail < new_end; new_tail++) QUE_INIT(new_tail->bcb_page_mod); // Move any active buffers from old block to new new_tail = bcb->bcb_rpt; for (bcb_repeat* old_tail = old_rpt; old_tail < old_end; old_tail++, new_tail++) { new_tail->bcb_bdb = old_tail->bcb_bdb; while (QUE_NOT_EMPTY(old_tail->bcb_page_mod)) { QUE que_inst = old_tail->bcb_page_mod.que_forward; BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_que); QUE_DELETE(*que_inst); QUE mod_que = &bcb->bcb_rpt[bdb->bdb_page.getPageNum() % bcb->bcb_count].bcb_page_mod; QUE_INSERT(*mod_que, *que_inst); } } // Allocate new buffer descriptor blocks ULONG num_in_seg = 0; UCHAR* memory = NULL; for (; new_tail < new_end; new_tail++) { // if current segment is exhausted, allocate another if (!num_in_seg) { const size_t alloc_size = dbb->dbb_page_size * (num_per_seg + 1); memory = (UCHAR*) bcb->bcb_bufferpool->allocate(alloc_size ALLOC_ARGS); bcb->bcb_memory.push(memory); memory = FB_ALIGN(memory, dbb->dbb_page_size); num_in_seg = num_per_seg; left_to_do -= num_per_seg; if (num_per_seg > left_to_do) num_per_seg = left_to_do; } new_tail->bcb_bdb = alloc_bdb(tdbb, bcb, &memory); num_in_seg--; } // Set up new buffer control, release old buffer control, and clean up delete[] old_rpt; return true; } static BufferDesc* find_buffer(BufferControl* bcb, const PageNumber page, bool findPending) { QUE mod_que = &bcb->bcb_rpt[page.getPageNum() % bcb->bcb_count].bcb_page_mod; QUE que_inst = mod_que->que_forward; for (; que_inst != mod_que; que_inst = que_inst->que_forward) { BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_que); if (bdb->bdb_page == page) return bdb; } if (findPending) { que_inst = bcb->bcb_pending.que_forward; for (; que_inst != &bcb->bcb_pending; que_inst = que_inst->que_forward) { BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_que); if (bdb->bdb_page == page || bdb->bdb_pending_page == page) return bdb; } } return NULL; } static LatchState latch_buffer(thread_db* tdbb, Sync &bcbSync, BufferDesc *bdb, const PageNumber page, SyncType syncType, int wait) { //++bdb->bdb_use_count; if (!(bdb->bdb_flags & BDB_free_pending) #ifdef SUPERSERVER_V2 && (page != HEADER_PAGE_NUMBER) #endif ) { recentlyUsed(bdb); } // If the buffer is currently being replaced by another page but is still being written // to disk, we should wait until this write finishes, else we could // allocate another buffer and read the old page image (or even zeros) // from disk into the new buffer const bool
waitPending = ((bdb->bdb_flags & BDB_free_pending) && bdb->bdb_page == page); bcbSync.unlock(); if (waitPending) { //--bdb->bdb_use_count; if (wait == 0) return lsTimeout; // go out Thread::yield(); } else { const bool latchOk = bdb->addRef(tdbb, syncType, wait); //--bdb->bdb_use_count; if (!latchOk) return lsTimeout; // go out if (bdb->bdb_page == page) { //bdb->bdb_flags &= ~(BDB_faked | BDB_prefetch); return lsOk; } bdb->release(tdbb, true); } return lsPageChanged; // try again } static BufferDesc* get_buffer(thread_db* tdbb, const PageNumber page, SyncType syncType, int wait) { /************************************** * * g e t _ b u f f e r * ************************************** * * Functional description * Get a buffer. If possible, get a buffer already assigned * to the page. Otherwise get one from the free list or pick * the least recently used buffer to be reused. * Note the following special page numbers: * -1 indicates that a buffer is required for journaling => obsolete * -2 indicates a special scratch buffer for shadowing * * input * page: page to get * syncType: type of lock to acquire on the page. * wait: 1 => Wait as long as necessary to get the lock. * This can cause deadlocks of course. * 0 => If the lock can't be acquired immediately, * give up and return 0; * => Latch timeout interval in seconds. * * return * BufferDesc pointer if successful. * NULL pointer if timeout occurred (only possible if wait <> 1), * or if the cache manager doesn't have any pages to write anymore. * **************************************/ SET_TDBB(tdbb); Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb; Sync bcbSync(&bcb->bcb_syncObject, "get_buffer"); if (page != FREE_PAGE) { bcbSync.lock(SYNC_SHARED); BufferDesc* bdb = find_buffer(bcb, page, true); while (bdb) { const LatchState ret = latch_buffer(tdbb, bcbSync, bdb, page, syncType, wait); if (ret == lsOk) { tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES); return bdb; } if (ret == lsTimeout) return NULL; bcbSync.lock(SYNC_SHARED); bdb = find_buffer(bcb, page, true); } bcbSync.unlock(); } bcbSync.lock(SYNC_EXCLUSIVE); QUE que_inst; int walk = bcb->bcb_free_minimum; while (true) { if (page != FREE_PAGE) { // Check to see if buffer has already been assigned to page BufferDesc* bdb = find_buffer(bcb, page, true); while (bdb) { const LatchState ret = latch_buffer(tdbb, bcbSync, bdb, page, syncType, wait); if (ret == lsOk) { tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES); return bdb; } if (ret == lsTimeout) return NULL; bcbSync.lock(SYNC_EXCLUSIVE); bdb = find_buffer(bcb, page, true); } } else // page == FREE_PAGE { // This code is only used by the background I/O threads: // cache writer, cache reader and garbage collector. //Database::Checkout dcoHolder(dbb); Sync lruSync(&bcb->bcb_syncLRU, "get_buffer"); lruSync.lock(SYNC_EXCLUSIVE); for (que_inst = bcb->bcb_in_use.que_backward; que_inst != &bcb->bcb_in_use; que_inst = que_inst->que_backward) { BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_in_use); if (bdb->bdb_use_count || (bdb->bdb_flags & BDB_free_pending)) continue; if (bdb->bdb_flags & BDB_db_dirty) { //tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES); shouldn't it be here?
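// Hand this db-dirty buffer to the background caller: the cache writer (or
// reader) that asked for FREE_PAGE will write it out and then come back for
// another victim.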
return bdb; } if (!--walk) { bcb->bcb_flags &= ~BCB_free_pending; break; } } // hvlad: removed in Vulcan bcb->bcb_flags &= ~BCB_free_pending; return NULL; } // If there is an empty buffer sitting around, allocate it if (QUE_NOT_EMPTY(bcb->bcb_empty)) { que_inst = bcb->bcb_empty.que_forward; QUE_DELETE(*que_inst); BufferDesc* bdb = BLOCK(que_inst, BufferDesc, bdb_que); bcb->bcb_inuse++; bdb->addRef(tdbb, SYNC_EXCLUSIVE); if (page != FREE_PAGE) { QUE mod_que = &bcb->bcb_rpt[page.getPageNum() % bcb->bcb_count].bcb_page_mod; QUE_INSERT(*mod_que, *que_inst); #ifdef SUPERSERVER_V2 // Reserve a buffer for header page with deferred header // page write mechanism. Otherwise, a deadlock will occur // if all dirty pages in the cache must force header page // to disk before they can be written but there is no free // buffer to read the header page into. if (page != HEADER_PAGE_NUMBER) #endif { Sync lruSync(&bcb->bcb_syncLRU, "get_buffer"); lruSync.lock(SYNC_EXCLUSIVE); QUE_INSERT(bcb->bcb_in_use, bdb->bdb_in_use); } } // This correction for bdb_use_count below is needed to // avoid a deadlock situation in latching code. It's not // clear though how the bdb_use_count can get < 0 for a bdb // in bcb_empty queue if (bdb->bdb_use_count < 0) BUGCHECK(301); // msg 301 Non-zero use_count of a buffer in the empty que_inst bdb->bdb_page = page; bdb->bdb_flags = BDB_read_pending; // we have buffer exclusively, this is safe bdb->bdb_scan_count = 0; if (page != FREE_PAGE) { CCH_TRACE(("bdb->bdb_lock->lck_logical = LCK_none; page=%i", bdb->bdb_page.getPageNum())); bdb->bdb_lock->lck_logical = LCK_none; } else PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES); return bdb; } Sync lruSync(&bcb->bcb_syncLRU, "get_buffer"); lruSync.lock(SYNC_EXCLUSIVE); if (bcb->bcb_lru_chain.load() != NULL) requeueRecentlyUsed(bcb); for (que_inst = bcb->bcb_in_use.que_backward; que_inst != &bcb->bcb_in_use; que_inst = que_inst->que_backward) { // get the oldest buffer as the least recently used -- note // that since there are no empty buffers this queue cannot be empty if (bcb->bcb_in_use.que_forward == &bcb->bcb_in_use) BUGCHECK(213); // msg 213 insufficient cache size BufferDesc* oldest = BLOCK(que_inst, BufferDesc, bdb_in_use); if (oldest->bdb_flags & BDB_lru_chained) continue; if (oldest->bdb_use_count || !oldest->addRefConditional(tdbb, SYNC_EXCLUSIVE)) continue; if ((oldest->bdb_flags & BDB_free_pending) || !writeable(oldest)) { oldest->release(tdbb, true); continue; } #ifdef SUPERSERVER_V2 // If page has been prefetched but not yet fetched, let // it cycle once more thru LRU queue before re-using it. 
if (oldest->bdb_flags & BDB_prefetch) { oldest->bdb_flags &= ~BDB_prefetch; que_inst = que_inst->que_forward; QUE_MOST_RECENTLY_USED(oldest->bdb_in_use); //LATCH_MUTEX_RELEASE; continue; } #endif if ((bcb->bcb_flags & BCB_cache_writer) && (oldest->bdb_flags & (BDB_dirty | BDB_db_dirty)) ) { bcb->bcb_flags |= BCB_free_pending; if (!(bcb->bcb_flags & BCB_writer_active)) bcb->bcb_writer_sem.release(); if (walk) { oldest->release(tdbb, true); if (!--walk) break; continue; } } BufferDesc* bdb = oldest; // hvlad: we already have bcb_lruSync here //recentlyUsed(bdb); fb_assert(!(bdb->bdb_flags & BDB_lru_chained)); QUE_DELETE(bdb->bdb_in_use); QUE_INSERT(bcb->bcb_in_use, bdb->bdb_in_use); lruSync.unlock(); bdb->bdb_flags |= BDB_free_pending; bdb->bdb_pending_page = page; QUE_DELETE(bdb->bdb_que); QUE_INSERT(bcb->bcb_pending, bdb->bdb_que); const bool needCleanup = (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) || QUE_NOT_EMPTY(bdb->bdb_higher) || QUE_NOT_EMPTY(bdb->bdb_lower); if (needCleanup) { bcbSync.unlock(); // If the buffer selected is dirty, arrange to have it written. if (bdb->bdb_flags & (BDB_dirty | BDB_db_dirty)) { const bool write_thru = (bcb->bcb_flags & BCB_exclusive); if (!write_buffer(tdbb, bdb, bdb->bdb_page, write_thru, tdbb->tdbb_status_vector, true)) { bcbSync.lock(SYNC_EXCLUSIVE); bdb->bdb_flags &= ~BDB_free_pending; QUE_DELETE(bdb->bdb_in_use); QUE_APPEND(bcb->bcb_in_use, bdb->bdb_in_use); bcbSync.unlock(); bdb->release(tdbb, true); CCH_unwind(tdbb, true); } } // If the buffer is still in the dirty tree, remove it. // In any case, release any lock it may have. removeDirty(bcb, bdb); // Cleanup any residual precedence blocks. Unless something is // screwed up, the only precedence blocks that can still be hanging // around are ones cleared at AST level. if (QUE_NOT_EMPTY(bdb->bdb_higher) || QUE_NOT_EMPTY(bdb->bdb_lower)) { Sync precSync(&bcb->bcb_syncPrecedence, "get_buffer"); precSync.lock(SYNC_EXCLUSIVE); while (QUE_NOT_EMPTY(bdb->bdb_higher)) { QUE que2 = bdb->bdb_higher.que_forward; Precedence* precedence = BLOCK(que2, Precedence, pre_higher); QUE_DELETE(precedence->pre_higher); QUE_DELETE(precedence->pre_lower); precedence->pre_hi = (BufferDesc*) bcb->bcb_free; bcb->bcb_free = precedence; } clear_precedence(tdbb, bdb); } bcbSync.lock(SYNC_EXCLUSIVE); } QUE_DELETE(bdb->bdb_que); // bcb_pending QUE mod_que = &bcb->bcb_rpt[page.getPageNum() % bcb->bcb_count].bcb_page_mod; QUE_INSERT((*mod_que), bdb->bdb_que); bdb->bdb_flags &= ~BDB_free_pending; // This correction for bdb_use_count below is needed to // avoid a deadlock situation in latching code. It's not // clear though how the bdb_use_count can get < 0 for a bdb // in bcb_empty queue if (bdb->bdb_use_count < 0) BUGCHECK(301); /* msg 301 Non-zero use_count of a buffer in the empty Que */ bdb->bdb_page = page; bdb->bdb_flags &= BDB_lru_chained; // yes, clear all except BDB_lru_chained bdb->bdb_flags |= BDB_read_pending; bdb->bdb_scan_count = 0; bcbSync.unlock(); if (page != FREE_PAGE) bdb->bdb_lock->lck_logical = LCK_none; else PAGE_LOCK_RELEASE(tdbb, bcb, bdb->bdb_lock); tdbb->bumpStats(RuntimeStatistics::PAGE_FETCHES); return bdb; } if (que_inst == &bcb->bcb_in_use) expand_buffers(tdbb, bcb->bcb_count + 75); } } static ULONG get_prec_walk_mark(BufferControl* bcb) { /************************************** * * g e t _ p r e c _ w a l k _ m a r k * ************************************** * * Functional description * Get next mark for walking precedence graph. 
* **************************************/ fb_assert(bcb->bcb_syncPrecedence.ourExclusiveLock()); if (++bcb->bcb_prec_walk_mark == 0) { for (ULONG i = 0; i < bcb->bcb_count; i++) bcb->bcb_rpt[i].bcb_bdb->bdb_prec_walk_mark = 0; bcb->bcb_prec_walk_mark = 1; } return bcb->bcb_prec_walk_mark; } static int get_related(BufferDesc* bdb, PagesArray &lowPages, int limit, const ULONG mark) { /************************************** * * g e t _ r e l a t e d * ************************************** * * Functional description * Recursively walk the low part of the precedence graph of the given buffer and put * low page numbers into the array. * **************************************/ BufferControl* bcb = bdb->bdb_bcb; fb_assert(bcb->bcb_syncPrecedence.ourExclusiveLock()); const struct que* base = &bdb->bdb_lower; for (const struct que* que_inst = base->que_forward; que_inst != base; que_inst = que_inst->que_forward) { const Precedence* precedence = BLOCK(que_inst, Precedence, pre_lower); if (precedence->pre_flags & PRE_cleared) continue; BufferDesc* low = precedence->pre_low; if (low->bdb_prec_walk_mark == mark) continue; if (!--limit) return 0; const SLONG lowPage = low->bdb_page.getPageNum(); FB_SIZE_T pos; if (!lowPages.find(lowPage, pos)) lowPages.insert(pos, lowPage); if (QUE_NOT_EMPTY(low->bdb_lower)) { limit = get_related(low, lowPages, limit, mark); if (!limit) return 0; } else low->bdb_prec_walk_mark = mark; } bdb->bdb_prec_walk_mark = mark; return limit; } static LockState lock_buffer(thread_db* tdbb, BufferDesc* bdb, const SSHORT wait, const SCHAR page_type) { /************************************** * * l o c k _ b u f f e r * ************************************** * * Functional description * Get a lock on page for a buffer. If the lock ever slipped * below READ, indicate that the page must be read. * * input: * wait: LCK_WAIT = 1 => Wait as long as necessary to get the lock. * LCK_NO_WAIT = 0 => If the lock can't be acquired immediately, * give up and return -1. * => Lock timeout interval in seconds. * * return: 0 => buffer locked, page is already in memory. * 1 => buffer locked, page needs to be read from disk. * -1 => timeout on lock occurred, see input parameter 'wait'. * **************************************/ SET_TDBB(tdbb); BufferControl* const bcb = bdb->bdb_bcb; fb_assert(!(bcb->bcb_flags & BCB_exclusive)); const USHORT lock_type = (bdb->bdb_flags & (BDB_dirty | BDB_writer)) ? LCK_write : LCK_read; CCH_TRACE(("FE LOCK %d:%06d, %s", bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum(), (lock_type >= LCK_write) ? "EX" : "SH" )); Lock* const lock = bdb->bdb_lock; if (lock->lck_logical >= lock_type) return lsLockedHavePage; TEXT errmsg[MAX_ERRMSG_LEN + 1]; ThreadStatusGuard tempStatus(tdbb); if (lock->lck_logical == LCK_none) { // Prevent header and TIP pages from generating blocking AST // overhead. The promise is that the lock will unconditionally // be released when the buffer use count indicates it is safe to do so. if (page_type == pag_header || page_type == pag_transactions) { fb_assert(lock->lck_ast == blocking_ast_bdb); fb_assert(lock->lck_object == bdb); lock->lck_ast = 0; lock->lck_object = NULL; } else { fb_assert(lock->lck_ast != NULL); } bdb->bdb_page.getLockStr(lock->getKeyPtr()); if (LCK_lock_opt(tdbb, lock, lock_type, wait)) { if (!lock->lck_ast) { // Restore blocking AST to lock block if it was swapped out. // Flag the BufferDesc so that the lock is released when the buffer is released.
fb_assert(page_type == pag_header || page_type == pag_transactions); lock->lck_ast = blocking_ast_bdb; lock->lck_object = bdb; bdb->bdb_flags |= BDB_no_blocking_ast; } return lsLocked; } if (!lock->lck_ast) { fb_assert(page_type == pag_header || page_type == pag_transactions); lock->lck_ast = blocking_ast_bdb; lock->lck_object = bdb; } // Case: a timeout was specified, or the caller didn't want to wait, return the error. if ((wait == LCK_NO_WAIT) || ((wait < 0) && (tempStatus->getErrors()[1] == isc_lock_timeout))) { bdb->release(tdbb, false); return lsLockTimeout; } // Case: lock manager detected a deadlock, probably caused by locking the // BufferDesc's in an unfortunate order. Nothing we can do about it, return the // error, and log it to firebird.log. FbStatusVector* const status = tempStatus.restore(); fb_msg_format(0, JRD_BUGCHK, 216, sizeof(errmsg), errmsg, MsgFormat::SafeArg() << bdb->bdb_page.getPageNum() << (int) page_type); ERR_append_status(status, Arg::Gds(isc_random) << Arg::Str(errmsg)); ERR_log(JRD_BUGCHK, 216, errmsg); // // msg 216 page %ld, page type %ld lock denied // CCH_unwind releases all the BufferDesc's and calls ERR_punt() // ERR_punt will longjump. CCH_unwind(tdbb, true); } // Lock requires an upward conversion. Sigh. Try to get the conversion. // If it fails, release the lock and re-seize. Save the contents of the // status vector just in case const LockState must_read = (lock->lck_logical < LCK_read) ? lsLocked : lsLockedHavePage; if (LCK_convert_opt(tdbb, lock, lock_type)) return must_read; if (wait == LCK_NO_WAIT) { bdb->release(tdbb, true); return lsLockTimeout; } if (LCK_lock(tdbb, lock, lock_type, wait)) return lsLocked; // Case: a timeout was specified, or the caller didn't want to wait, return the error. if ((wait < 0) && (tempStatus->getErrors()[1] == isc_lock_timeout)) { bdb->release(tdbb, false); return lsLockTimeout; } // Case: lock manager detected a deadlock, probably caused by locking the // BufferDesc's in an unfortunate order. Nothing we can do about it, return the // error, and log it to firebird.log. FbStatusVector* const status = tempStatus.restore(); fb_msg_format(0, JRD_BUGCHK, 215, sizeof(errmsg), errmsg, MsgFormat::SafeArg() << bdb->bdb_page.getPageNum() << (int) page_type); ERR_append_status(status, Arg::Gds(isc_random) << Arg::Str(errmsg)); ERR_log(JRD_BUGCHK, 215, errmsg); // msg 215 page %ld, page type %ld lock conversion denied CCH_unwind(tdbb, true); return lsError; // Added to get rid of Compiler Warning } static ULONG memory_init(thread_db* tdbb, BufferControl* bcb, SLONG number) { /************************************** * * m e m o r y _ i n i t * ************************************** * * Functional description * Initialize memory for the cache. * Return number of buffers allocated. * **************************************/ SET_TDBB(tdbb); Database* const dbb = tdbb->getDatabase(); UCHAR* memory = NULL; SLONG buffers = 0; const size_t page_size = dbb->dbb_page_size; size_t memory_size = page_size * (number + 1); fb_assert(memory_size > 0); SLONG old_buffers = 0; bcb_repeat* old_tail = NULL; const UCHAR* memory_end = NULL; bcb_repeat* tail = bcb->bcb_rpt; // "end" is changed inside the loop for (const bcb_repeat* end = tail + number; tail < end; tail++) { if (!memory) { // Allocate only what is required for remaining buffers. 
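// Worked example with hypothetical numbers: at a 4K page size and 1000 buffers
// still to allocate, the first request is for ~4MB. If that allocation throws
// BadAlloc, the handler below halves memory_size (2MB, 1MB, ...) until either
// an allocation succeeds or the size drops under MIN_BUFFER_SEGMENT, at which
// point memory_init settles for the buffers it already has.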
if (memory_size > (page_size * (number + 1))) memory_size = page_size * (number + 1); while (true) { try { memory = (UCHAR*) bcb->bcb_bufferpool->allocate(memory_size ALLOC_ARGS); break; } catch (Firebird::BadAlloc&) { // Either there's not enough virtual memory or there is // but it's not virtually contiguous. Let's find out by // cutting the size in half to see if the buffers can be // scattered over the remaining virtual address space. memory_size >>= 1; if (memory_size < MIN_BUFFER_SEGMENT) { // Diminishing returns return buffers; } } } bcb->bcb_memory.push(memory); memory_end = memory + memory_size; // Allocate buffers on an address that is an even multiple // of the page size (rather than the physical sector size). This // is a necessary condition to support raw I/O interfaces. memory = FB_ALIGN(memory, page_size); old_tail = tail; old_buffers = buffers; } QUE_INIT(tail->bcb_page_mod); try { tail->bcb_bdb = alloc_bdb(tdbb, bcb, &memory); } catch (Firebird::BadAlloc&) { // Whoops! Time to reset our expectations. Release the buffer memory // but use that memory size to calculate a new number that takes into account // the page buffer overhead. Reduce this number by a 25% fudge factor to // leave some memory for useful work. bcb->bcb_bufferpool->deallocate(bcb->bcb_memory.pop()); memory = NULL; for (bcb_repeat* tail2 = old_tail; tail2 < tail; tail2++) tail2->bcb_bdb = dealloc_bdb(tail2->bcb_bdb); number = static_cast<SLONG>(memory_size / PAGE_OVERHEAD); number -= number >> 2; end = old_tail + number; tail = --old_tail; // For loop continue pops tail above buffers = old_buffers; continue; } buffers++; // Allocated buffers number--; // Remaining buffers // Check if memory segment has been exhausted. if (memory + page_size > memory_end) memory = 0; } return buffers; } static void page_validation_error(thread_db* tdbb, WIN* window, SSHORT type) { /************************************** * * p a g e _ v a l i d a t i o n _ e r r o r * ************************************** * * Functional description * We've detected a validation error on fetch. Generally * we've detected that the type of page fetched didn't match the * type of page we were expecting. Report an error and * get out. * This function will only be called rarely, as a page validation * error is an indication of on-disk database corruption. * **************************************/ SET_TDBB(tdbb); BufferDesc* bdb = window->win_bdb; const pag* page = bdb->bdb_buffer; PageSpace* pages = tdbb->getDatabase()->dbb_page_manager.findPageSpace(bdb->bdb_page.getPageSpaceID()); ERR_build_status(tdbb->tdbb_status_vector, Arg::Gds(isc_db_corrupt) << Arg::Str(pages->file->fil_string) << Arg::Gds(isc_page_type_err) << Arg::Gds(isc_badpagtyp) << Arg::Num(bdb->bdb_page.getPageNum()) << pagtype(type) << pagtype(page->pag_type)); // We should invalidate this bad buffer. CCH_unwind(tdbb, true); } #ifdef CACHE_READER static void prefetch_epilogue(Prefetch* prefetch, FbStatusVector* status_vector) { /************************************** * * p r e f e t c h _ e p i l o g u e * ************************************** * * Functional description * Stall on asynchronous I/O completion. * Move data from prefetch buffer to database * buffers, compute the checksum, and release * the latch.
* **************************************/ if (!(prefetch->prf_flags & PRF_active)) return; thread_db* tdbb = prefetch->prf_tdbb; Database* dbb = tdbb->getDatabase(); prefetch->prf_piob.piob_wait = TRUE; const bool async_status = PIO_status(dbb, &prefetch->prf_piob, status_vector); // If there was an I/O error release all buffer latches acquired for the prefetch request. if (!async_status) { BufferDesc** next_bdb = prefetch->prf_bdbs; for (USHORT i = 0; i < prefetch->prf_max_prefetch; i++) { if (*next_bdb) release_bdb(tdbb, *next_bdb, true, false, false); next_bdb++; } prefetch->prf_flags &= ~PRF_active; return; } const SCHAR* next_buffer = prefetch->prf_io_buffer; BufferDesc** next_bdb = prefetch->prf_bdbs; for (USHORT i = 0; i < prefetch->prf_max_prefetch; i++) { if (*next_bdb) { pag* page = (*next_bdb)->bdb_buffer; if (next_buffer != reinterpret_cast<const SCHAR*>(page)) memcpy(page, next_buffer, (ULONG) dbb->dbb_page_size); if (page->pag_pageno == (*next_bdb)->bdb_page.getPageNum()) { (*next_bdb)->bdb_flags &= ~(BDB_read_pending | BDB_not_valid); (*next_bdb)->bdb_flags |= BDB_prefetch; } release_bdb(tdbb, *next_bdb, true, false, false); } next_buffer += dbb->dbb_page_size; next_bdb++; } prefetch->prf_flags &= ~PRF_active; } static void prefetch_init(Prefetch* prefetch, thread_db* tdbb) { /************************************** * * p r e f e t c h _ i n i t * ************************************** * * Functional description * Initialize prefetch data structure. * Most systems that allow access to "raw" I/O * interfaces want the buffer address aligned. * **************************************/ Database* dbb = tdbb->getDatabase(); prefetch->prf_tdbb = tdbb; prefetch->prf_flags = 0; prefetch->prf_max_prefetch = PREFETCH_MAX_TRANSFER / dbb->dbb_page_size; prefetch->prf_aligned_buffer = (SCHAR*) (((U_IPTR) &prefetch->prf_unaligned_buffer + MIN_PAGE_SIZE - 1) & ~((U_IPTR) MIN_PAGE_SIZE - 1)); } static void prefetch_io(Prefetch* prefetch, FbStatusVector* status_vector) { /************************************** * * p r e f e t c h _ i o * ************************************** * * Functional description * Queue an asynchronous I/O to read * multiple pages into prefetch buffer. * **************************************/ thread_db* tdbb = prefetch->prf_tdbb; Database* dbb = tdbb->getDatabase(); if (!prefetch->prf_page_count) prefetch->prf_flags &= ~PRF_active; else { // Get the cache reader working on our behalf too if (!(dbb->dbb_bcb->bcb_flags & BCB_reader_active)) dbb->dbb_reader_sem.release(); const bool async_status = PIO_read_ahead(dbb, prefetch->prf_start_page, prefetch->prf_io_buffer, prefetch->prf_page_count, &prefetch->prf_piob, status_vector); if (!async_status) { BufferDesc** next_bdb = prefetch->prf_bdbs; for (USHORT i = 0; i < prefetch->prf_max_prefetch; i++) { if (*next_bdb) release_bdb(tdbb, *next_bdb, true, false, false); next_bdb++; } prefetch->prf_flags &= ~PRF_active; } } } static void prefetch_prologue(Prefetch* prefetch, SLONG* start_page) { /************************************** * * p r e f e t c h _ p r o l o g u e * ************************************** * * Functional description * Search for consecutive pages to be prefetched * and latch them for I/O.
* **************************************/ thread_db* tdbb = prefetch->prf_tdbb; Database* dbb = tdbb->getDatabase(); BufferControl* bcb = dbb->dbb_bcb; prefetch->prf_start_page = *start_page; prefetch->prf_page_count = 0; prefetch->prf_flags |= PRF_active; BufferDesc** next_bdb = prefetch->prf_bdbs; for (USHORT i = 0; i < prefetch->prf_max_prefetch; i++) { *next_bdb = 0; if (SBM_clear(bcb->bcb_prefetch, *start_page) && (*next_bdb = get_buffer(tdbb, *start_page, SYNC_SHARED, 0))) { if ((*next_bdb)->bdb_flags & BDB_read_pending) prefetch->prf_page_count = i + 1; else { release_bdb(tdbb, *next_bdb, true, false, false); *next_bdb = 0; } } next_bdb++; (*start_page)++; } // Optimize non-sequential list prefetching to transfer directly to database buffers. BufferDesc* bdb; if (prefetch->prf_page_count == 1 && (bdb = prefetch->prf_bdbs[0])) prefetch->prf_io_buffer = reinterpret_cast<SCHAR*>(bdb->bdb_buffer); else prefetch->prf_io_buffer = prefetch->prf_aligned_buffer; // Reset starting page for next bitmap walk --(*start_page); } #endif // CACHE_READER static SSHORT related(BufferDesc* low, const BufferDesc* high, SSHORT limit, const ULONG mark) { /************************************** * * r e l a t e d * ************************************** * * Functional description * See if there are precedence relationships linking two buffers. * Since precedence graphs can become very complex, limit search for * precedence relationship by visiting a prescribed limit of higher * precedence blocks. * **************************************/ const struct que* base = &low->bdb_higher; for (const struct que* que_inst = base->que_forward; que_inst != base; que_inst = que_inst->que_forward) { if (!--limit) return PRE_UNKNOWN; const Precedence* precedence = BLOCK(que_inst, Precedence, pre_higher); if (!(precedence->pre_flags & PRE_cleared)) { if (precedence->pre_hi->bdb_prec_walk_mark == mark) continue; if (precedence->pre_hi == high) return PRE_EXISTS; if (QUE_NOT_EMPTY(precedence->pre_hi->bdb_higher)) { limit = related(precedence->pre_hi, high, limit, mark); if (limit == PRE_EXISTS || limit == PRE_UNKNOWN) return limit; } else precedence->pre_hi->bdb_prec_walk_mark = mark; } } low->bdb_prec_walk_mark = mark; return limit; } static inline bool writeable(BufferDesc* bdb) { /************************************** * * w r i t e a b l e * ************************************** * * Functional description * See if a buffer is writeable. A buffer is writeable if * neither it nor any of its higher precedence cousins are * marked for write. * This is the starting point of a recursive walk of the precedence * graph. The writeable_mark member is used to mark already seen * buffers to avoid repeated walks of the same sub-graph. * Currently this function can't be called from more than one * thread simultaneously. When SMP is implemented we must * take additional care about thread safety. * **************************************/ if (bdb->bdb_flags & BDB_marked) return false; if (QUE_EMPTY(bdb->bdb_higher)) return true; BufferControl* bcb = bdb->bdb_bcb; Sync syncPrec(&bcb->bcb_syncPrecedence, "writeable"); syncPrec.lock(SYNC_EXCLUSIVE); const ULONG mark = get_prec_walk_mark(bcb); return is_writeable(bdb, mark); } static bool is_writeable(BufferDesc* bdb, const ULONG mark) { /************************************** * * i s _ w r i t e a b l e * ************************************** * * Functional description * See if a buffer is writeable.
static bool is_writeable(BufferDesc* bdb, const ULONG mark)
{
/**************************************
 *
 *	i s _ w r i t e a b l e
 *
 **************************************
 *
 * Functional description
 *	See if a buffer is writeable.  A buffer is writeable if
 *	neither it nor any of its higher precedence cousins are
 *	marked for write.
 *
 **************************************/
	// If there are buffers that must be written first, check them, too.

	for (const que* queue = bdb->bdb_higher.que_forward; queue != &bdb->bdb_higher;
		 queue = queue->que_forward)
	{
		const Precedence* precedence = BLOCK(queue, Precedence, pre_higher);

		if (!(precedence->pre_flags & PRE_cleared))
		{
			BufferDesc* high = precedence->pre_hi;

			if (high->bdb_flags & BDB_marked)
				return false;

			if (high->bdb_prec_walk_mark != mark)
			{
				if (QUE_EMPTY(high->bdb_higher))
					high->bdb_prec_walk_mark = mark;
				else if (!is_writeable(high, mark))
					return false;
			}
		}
	}

	bdb->bdb_prec_walk_mark = mark;
	return true;
}


static int write_buffer(thread_db* tdbb,
						BufferDesc* bdb,
						const PageNumber page,
						const bool write_thru,
						FbStatusVector* const status,
						const bool write_this_page)
{
/**************************************
 *
 *	w r i t e _ b u f f e r
 *
 **************************************
 *
 * Functional description
 *	Write a dirty buffer.  This may recurse due to
 *	precedence problems.
 *
 * input:	write_this_page
 *		= true if the input page needs to be written
 *		  before returning.  (normal case)
 *		= false if the input page is being written
 *		  because of precedence.  Only write one page
 *		  and return so that the caller can re-establish
 *		  the need to write this page.
 *
 * return:	0 = Write failed.
 *		1 = Page is written.  Page was written by this
 *		    call, or was written by someone else, or the
 *		    cache buffer is already reassigned.
 *		2 = Only possible if write_this_page is false.
 *		    This input page is not written.  One page
 *		    higher in precedence is written though.
 *		    Probable action: re-establish the need to
 *		    write this page and retry the write.
 *
 **************************************/
	SET_TDBB(tdbb);

#ifdef SUPERSERVER_V2
	Database* const dbb = tdbb->getDatabase();
#endif

	bdb->lockIO(tdbb);

	if (bdb->bdb_page != page)
	{
		bdb->unLockIO(tdbb);
		return 1;
	}

	if ((bdb->bdb_flags & BDB_marked) && !(bdb->bdb_flags & BDB_faked))
		BUGCHECK(217);	// msg 217 buffer marked for update

	if (!(bdb->bdb_flags & BDB_dirty) && !(write_thru && bdb->bdb_flags & BDB_db_dirty))
	{
		bdb->unLockIO(tdbb);
		clear_precedence(tdbb, bdb);
		return 1;
	}

	// If there are buffers that must be written first, write them now.

	BufferControl* bcb = bdb->bdb_bcb;

	if (QUE_NOT_EMPTY(bdb->bdb_higher))
	{
		Sync syncPrec(&bcb->bcb_syncPrecedence, "write_buffer");

		while (true)
		{
			syncPrec.lock(SYNC_EXCLUSIVE);

			if (QUE_EMPTY(bdb->bdb_higher))
			{
				syncPrec.unlock();
				break;
			}

			QUE que_inst = bdb->bdb_higher.que_forward;
			Precedence* precedence = BLOCK(que_inst, Precedence, pre_higher);

			if (precedence->pre_flags & PRE_cleared)
			{
				QUE_DELETE(precedence->pre_higher);
				QUE_DELETE(precedence->pre_lower);
				precedence->pre_hi = (BufferDesc*) bcb->bcb_free;
				bcb->bcb_free = precedence;
				syncPrec.unlock();
			}
			else
			{
				bdb->unLockIO(tdbb);

				BufferDesc* hi_bdb = precedence->pre_hi;
				const PageNumber hi_page = hi_bdb->bdb_page;

				int write_status = 0;
				syncPrec.unlock();

				write_status = write_buffer(tdbb, hi_bdb, hi_page, write_thru, status, false);

				if (write_status == 0)
					return 0;		// return IO error

				if (!write_this_page)
				{
					// caller wants to re-establish the need for this write
					// after one precedence write
					return 2;
				}

				bdb->lockIO(tdbb);

				if (bdb->bdb_page != page)
				{
					bdb->unLockIO(tdbb);
					return 1;
				}
			}
		}
	}

#ifdef SUPERSERVER_V2
	// Header page I/O is deferred until a dirty page, which was modified by a
	// transaction not updated on the header page, needs to be written.  Note
	// that the header page needs to be written with the target page latched to
	// prevent younger transactions from modifying the target page.

	if (page != HEADER_PAGE_NUMBER && bdb->bdb_mark_transaction > dbb->dbb_last_header_write)
		TRA_header_write(tdbb, dbb, bdb->bdb_mark_transaction);
#endif

	// Unless the buffer has been faked (recently re-allocated), write out the page

	bool result = true;
	if ((bdb->bdb_flags & BDB_dirty || (write_thru && bdb->bdb_flags & BDB_db_dirty)) &&
		!(bdb->bdb_flags & BDB_marked))
	{
		result = write_page(tdbb, bdb, status, false);
	}

	bdb->unLockIO(tdbb);

	if (result)
		clear_precedence(tdbb, bdb);

	if (!result)
		return 0;

	if (!write_this_page)
		return 2;

	return 1;
}
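

// Illustrative note (not part of the original module): given the return
// contract documented above, a precedence-driven caller (write_this_page ==
// false) has to be prepared to loop, since 2 only reports that one
// higher-precedence page went out. A hedged sketch of such a caller:
//
//	int ret;
//	do {
//		ret = write_buffer(tdbb, bdb, bdb->bdb_page, false, status, false);
//	} while (ret == 2);	// re-establish the need for this write and retry
//
//	// ret == 0: I/O error (details in status)
//	// ret == 1: page written, written by someone else, or reassigned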
static bool write_page(thread_db* tdbb, BufferDesc* bdb, FbStatusVector* const status,
	const bool inAst)
{
/**************************************
 *
 *	w r i t e _ p a g e
 *
 **************************************
 *
 * Functional description
 *	Do actions required when writing a database page,
 *	including journaling, shadowing.
 *
 **************************************/
	CCH_TRACE(("WRITE %d:%06d", bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum()));

	// hvlad: why is it needed in Vulcan ???
	//Sync syncWrite(&bcb->bcb_syncPageWrite, "write_page");
	//syncWrite.lock(SYNC_EXCLUSIVE);

	if (bdb->bdb_flags & BDB_not_valid)
	{
		ERR_build_status(status, Arg::Gds(isc_buf_invalid) << Arg::Num(bdb->bdb_page.getPageNum()));
		return false;
	}

	Database* const dbb = tdbb->getDatabase();
	pag* const page = bdb->bdb_buffer;

	// Before writing the db header page, make sure that
	// next_transaction > oldest_active transaction

	if (bdb->bdb_page == HEADER_PAGE_NUMBER)
	{
		const header_page* const header = (header_page*) page;
		const TraNumber next_transaction = Ods::getNT(header);
		const TraNumber oldest_active = Ods::getOAT(header);
		const TraNumber oldest_transaction = Ods::getOIT(header);

		if (next_transaction)
		{
			if (oldest_active > next_transaction)
				BUGCHECK(266);	// next transaction older than oldest active

			if (oldest_transaction > next_transaction)
				BUGCHECK(267);	// next transaction older than oldest transaction
		}
	}

	page->pag_generation++;
	bool result = true;

	//if (!dbb->dbb_wal || write_thru) becomes
	//if (true || write_thru) then finally if (true)
	// I won't wipe out the if() itself, to allow my changes to be verified easily by others
	if (true)
	{
		tdbb->bumpStats(RuntimeStatistics::PAGE_WRITES);

		// write out page to main database file, and to any
		// shadows, making a special case of the header page
		BackupManager* bm = dbb->dbb_backup_manager;
		const int backup_state = bm->getState();

		/// ASF: Always true: if (bdb->bdb_page.getPageNum() >= 0)
		{
			fb_assert(backup_state != Ods::hdr_nbak_unknown);
			page->pag_pageno = bdb->bdb_page.getPageNum();

#ifdef NBAK_DEBUG
			// We cannot call normal trace functions here as they are signal-unsafe
			// "Write page=%d, dir=%d, diff=%d, scn=%d"
			char buffer[1000], *ptr = buffer;
			strcpy(ptr, "NBAK, Write page ");
			ptr += strlen(ptr);
			gds__ulstr(ptr, bdb->bdb_page.getPageNum(), 0, 0);
			ptr += strlen(ptr);

			strcpy(ptr, ", backup_state=");
			ptr += strlen(ptr);
			gds__ulstr(ptr, backup_state, 0, 0);
			ptr += strlen(ptr);

			strcpy(ptr, ", diff=");
			ptr += strlen(ptr);
			gds__ulstr(ptr, bdb->bdb_difference_page, 0, 0);
			ptr += strlen(ptr);

			strcpy(ptr, ", scn=");
			ptr += strlen(ptr);
			gds__ulstr(ptr, bdb->bdb_buffer->pag_scn, 0, 0);
			ptr += strlen(ptr);

			gds__trace(buffer);
#endif
			PageSpace* pageSpace =
				dbb->dbb_page_manager.findPageSpace(bdb->bdb_page.getPageSpaceID());
			fb_assert(pageSpace);
			const bool isTempPage = pageSpace->isTemporary();

			if (!isTempPage &&
				(backup_state == Ods::hdr_nbak_stalled ||
					(backup_state == Ods::hdr_nbak_merge && bdb->bdb_difference_page)))
			{
				if (!dbb->dbb_backup_manager->writeDifference(tdbb, status,
						bdb->bdb_difference_page, bdb->bdb_buffer))
				{
					bdb->bdb_flags |= BDB_io_error;
					dbb->dbb_flags |= DBB_suspend_bgio;
					return false;
				}
			}

			if (!isTempPage && backup_state == Ods::hdr_nbak_stalled)
			{
				// We finished. Adjust transaction accounting and get ready for exit
				if (bdb->bdb_page == HEADER_PAGE_NUMBER)
					dbb->dbb_last_header_write = Ods::getNT((header_page*) page);
			}
			else
			{
				// We need to write our pages to main database files

				class Pio : public CryptoManager::IOCallback
				{
				public:
					Pio(jrd_file* f, BufferDesc* b, bool ast, bool tp, PageSpace* ps)
						: file(f), bdb(b), inAst(ast), isTempPage(tp), pageSpace(ps)
					{ }

					bool callback(thread_db* tdbb, FbStatusVector* status, Ods::pag* page)
					{
						Database* dbb = tdbb->getDatabase();

						while (!PIO_write(tdbb, file, bdb, page, status))
						{
							if (isTempPage || !CCH_rollover_to_shadow(tdbb, dbb, file, inAst))
							{
								bdb->bdb_flags |= BDB_io_error;
								dbb->dbb_flags |= DBB_suspend_bgio;
								return false;
							}

							file = pageSpace->file;
						}

						if (bdb->bdb_page == HEADER_PAGE_NUMBER)
							dbb->dbb_last_header_write = Ods::getNT((header_page*) page);

						if (dbb->dbb_shadow && !isTempPage)
							return CCH_write_all_shadows(tdbb, 0, bdb, page, status, inAst);

						return true;
					}

				private:
					jrd_file* file;
					BufferDesc* bdb;
					bool inAst;
					bool isTempPage;
					PageSpace* pageSpace;
				};

				Pio io(pageSpace->file, bdb, inAst, isTempPage, pageSpace);
				result = dbb->dbb_crypto_manager->write(tdbb, status, page, &io);

				if (!result && (bdb->bdb_flags & BDB_io_error))
					return false;
			}
		}

		if (result)
			bdb->bdb_flags &= ~BDB_db_dirty;
	}

	if (!result)
	{
		// If there was a write error then idle background threads
		// so that they don't spin trying to write these pages. This
		// usually results from device-full errors which can be fixed
		// by someone freeing disk space.

		bdb->bdb_flags |= BDB_io_error;
		dbb->dbb_flags |= DBB_suspend_bgio;
	}
	else
	{
		// clear the dirty bit vector, since the buffer is now
		// clean regardless of which transactions have modified it

		// Destination difference page number is only valid between MARK and
		// write_page, so clean it now to avoid confusion
		bdb->bdb_difference_page = 0;
		bdb->bdb_transactions = 0;
		bdb->bdb_mark_transaction = 0;

		if (!(bdb->bdb_bcb->bcb_flags & BCB_keep_pages))
			removeDirty(bdb->bdb_bcb, bdb);

		bdb->bdb_flags &= ~(BDB_must_write | BDB_system_dirty);
		clear_dirty_flag_and_nbak_state(tdbb, bdb);

		if (bdb->bdb_flags & BDB_io_error)
		{
			// If a write error has cleared, signal background threads
			// to resume their regular duties. If someone has freed up
			// disk space these errors will spontaneously go away.

			bdb->bdb_flags &= ~BDB_io_error;
			dbb->dbb_flags &= ~DBB_suspend_bgio;
		}
	}

	return result;
}
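

// Illustrative note (not part of the original module): the nbackup branch in
// write_page() above splits on the backup state reported by the BackupManager.
// Summarizing the three paths for a non-temporary page:
//
//	hdr_nbak_stalled	- the page goes to the difference file only;
//				  the main-file write (the Pio path) is skipped
//	hdr_nbak_merge		- if a difference page is already allocated
//				  (bdb_difference_page != 0), the page is written
//				  to the difference file and then to the main file
//	any other state		- the page is written straight to the main file
//				  (and shadows); normal operation
//
// Temporary pages never touch the difference file.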
static void clear_dirty_flag_and_nbak_state(thread_db* tdbb, BufferDesc* bdb)
{
	const AtomicCounter::counter_type oldFlags =
		bdb->bdb_flags.exchangeBitAnd(~(BDB_dirty | BDB_nbak_state_lock));

	if (oldFlags & BDB_nbak_state_lock)
	{
		NBAK_TRACE(("unlock state for dirty page %d:%06d",
			bdb->bdb_page.getPageSpaceID(), bdb->bdb_page.getPageNum()));

		tdbb->getDatabase()->dbb_backup_manager->unlockStateRead(tdbb);
	}
	else if ((oldFlags & BDB_dirty) && bdb->bdb_page != HEADER_PAGE_NUMBER)
		fb_assert(PageSpace::isTemporary(bdb->bdb_page.getPageSpaceID()));
}


void recentlyUsed(BufferDesc* bdb)
{
	const AtomicCounter::counter_type oldFlags = bdb->bdb_flags.exchangeBitOr(BDB_lru_chained);
	if (oldFlags & BDB_lru_chained)
		return;

	BufferControl* bcb = bdb->bdb_bcb;

#ifdef DEV_BUILD
	volatile BufferDesc* chain = bcb->bcb_lru_chain;
	for (; chain; chain = chain->bdb_lru_chain)
	{
		if (chain == bdb)
			BUGCHECK(-1);	// !!
	}
#endif

	for (;;)
	{
		bdb->bdb_lru_chain = bcb->bcb_lru_chain;
		if (bcb->bcb_lru_chain.compare_exchange_strong(bdb->bdb_lru_chain, bdb))
			break;
	}
}


void requeueRecentlyUsed(BufferControl* bcb)
{
	BufferDesc* chain = NULL;

	// Let's pick up the LRU pending chain, if any

	for (;;)
	{
		chain = bcb->bcb_lru_chain;
		if (bcb->bcb_lru_chain.compare_exchange_strong(chain, NULL))
			break;
	}

	if (!chain)
		return;

	// Next, let's flip the order

	BufferDesc* reversed = NULL;
	BufferDesc* bdb;

	while ((bdb = chain) != NULL)
	{
		chain = bdb->bdb_lru_chain;
		bdb->bdb_lru_chain = reversed;
		reversed = bdb;
	}

	while ((bdb = reversed) != NULL)
	{
		reversed = bdb->bdb_lru_chain;
		QUE_DELETE(bdb->bdb_in_use);
		QUE_INSERT(bcb->bcb_in_use, bdb->bdb_in_use);

		bdb->bdb_flags &= ~BDB_lru_chained;
		bdb->bdb_lru_chain = NULL;
	}

	chain = bcb->bcb_lru_chain;
}
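

// Illustrative note (not part of the original module): recentlyUsed() pushes
// onto a lock-free LIFO list (a Treiber stack), guarded against double
// insertion by the BDB_lru_chained flag, and requeueRecentlyUsed() detaches
// the whole chain atomically and then reverses it, so buffers are re-queued
// in the order they were originally touched. A minimal, self-contained sketch
// of the same pattern using std::atomic:
//
//	struct Node { Node* next; };
//	std::atomic<Node*> head{nullptr};
//
//	void push(Node* n)			// cf. recentlyUsed()
//	{
//		n->next = head.load();
//		while (!head.compare_exchange_weak(n->next, n))
//			;			// 'expected' is reloaded on failure
//	}
//
//	Node* detachAll()			// cf. requeueRecentlyUsed()
//	{
//		return head.exchange(nullptr);	// reverse the result for FIFO order
//	}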
BufferControl* BufferControl::create(Database* dbb)
{
	MemoryPool* const pool = dbb->createPool();
	BufferControl* const bcb = FB_NEW_POOL(*pool) BufferControl(*pool, dbb->dbb_memory_stats);
	pool->setStatsGroup(bcb->bcb_memory_stats);
	return bcb;
}


void BufferControl::destroy(BufferControl* bcb)
{
	Database* const dbb = bcb->bcb_database;
	MemoryPool* const pool = bcb->bcb_bufferpool;
	MemoryStats temp_stats;
	pool->setStatsGroup(temp_stats);
	delete bcb;
	dbb->deletePool(pool);
}


bool BufferDesc::addRef(thread_db* tdbb, SyncType syncType, int wait)
{
	if (wait == 1)
		bdb_syncPage.lock(NULL, syncType, FB_FUNCTION);
	else if (!bdb_syncPage.lock(NULL, syncType, FB_FUNCTION, -wait * 1000))
		return false;

	++bdb_use_count;

	if (syncType == SYNC_EXCLUSIVE)
	{
		bdb_exclusive = tdbb;
		++bdb_writers;
	}

	tdbb->registerBdb(this);
	return true;
}


bool BufferDesc::addRefConditional(thread_db* tdbb, SyncType syncType)
{
	if (!bdb_syncPage.lockConditional(syncType, FB_FUNCTION))
		return false;

	++bdb_use_count;

	if (syncType == SYNC_EXCLUSIVE)
	{
		bdb_exclusive = tdbb;
		++bdb_writers;
	}

	tdbb->registerBdb(this);
	return true;
}


void BufferDesc::downgrade(SyncType syncType)
{
	if (syncType == SYNC_SHARED && !bdb_writers)
		return;

	if (bdb_writers != 1)
		BUGCHECK(296);	// inconsistent latch downgrade call

	--bdb_writers;
	bdb_exclusive = NULL;
	bdb_syncPage.downgrade(syncType);
}
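

// Illustrative note (not part of the original module): in addRef() the wait
// argument follows this module's lock-wait convention: wait == 1 blocks until
// the latch is granted, while a negative wait is a timeout in seconds that is
// converted to milliseconds for the underlying SyncObject (-wait * 1000).
// A hedged usage sketch:
//
//	if (bdb->addRef(tdbb, SYNC_SHARED, -10))	// give up after ~10 seconds
//	{
//		// latched; pair with bdb->release(tdbb, ...) when done
//	}
//	else
//	{
//		// timed out; the buffer was not latched
//	}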
void BufferDesc::release(thread_db* tdbb, bool repost)
{
	//const SyncType oldState = bdb_syncPage.getState(); Unfinished logic here???

	fb_assert(!(bdb_flags & BDB_marked) || bdb_writers > 1);

	if (!tdbb->clearBdb(this))
		return;

	--bdb_use_count;

	if (bdb_writers)
	{
		if (--bdb_writers == 0)
			bdb_exclusive = NULL;

		bdb_syncPage.unlock(NULL, SYNC_EXCLUSIVE);
	}
	else
		bdb_syncPage.unlock(NULL, SYNC_SHARED);

	if (repost && !isLocked() && (bdb_ast_flags & BDB_blocking))
	{
		PAGE_LOCK_RE_POST(tdbb, bdb_bcb, bdb_lock);
	}
}


void BufferDesc::lockIO(thread_db* tdbb)
{
	bdb_syncIO.lock(NULL, SYNC_EXCLUSIVE, FB_FUNCTION);

	fb_assert(!bdb_io_locks && bdb_io != tdbb || bdb_io_locks && bdb_io == tdbb);

	bdb_io = tdbb;
	bdb_io->registerBdb(this);
	++bdb_io_locks;
	++bdb_use_count;
}


void BufferDesc::unLockIO(thread_db* tdbb)
{
	fb_assert(bdb_io && bdb_io == tdbb);
	fb_assert(bdb_io_locks > 0);

	if (!bdb_io->clearBdb(this))
		return;

	--bdb_use_count;

	if (--bdb_io_locks == 0)
		bdb_io = NULL;

	bdb_syncIO.unlock(NULL, SYNC_EXCLUSIVE);
}
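

// Illustrative note (not part of the original module): the assertion in
// lockIO() reads as "either the I/O latch is currently free and we are not
// its recorded owner, or we already own it", so the counters anticipate the
// simple pairing below:
//
//	bdb->lockIO(tdbb);	// bdb_io = tdbb, bdb_io_locks becomes 1
//	// ... perform page I/O ...
//	bdb->unLockIO(tdbb);	// bdb_io_locks back to 0, bdb_io cleared
//
// unLockIO() asserts the converse: only the recorded owner may release, and
// only while the lock count is positive.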