001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.kahadb.page;
018
019 import java.io.ByteArrayInputStream;
020 import java.io.ByteArrayOutputStream;
021 import java.io.DataInputStream;
022 import java.io.DataOutputStream;
023 import java.io.File;
024 import java.io.FileInputStream;
025 import java.io.FileOutputStream;
026 import java.io.IOException;
027 import java.io.InterruptedIOException;
028 import java.io.RandomAccessFile;
029 import java.util.*;
030 import java.util.Map.Entry;
031 import java.util.concurrent.CountDownLatch;
032 import java.util.concurrent.atomic.AtomicBoolean;
033 import java.util.concurrent.atomic.AtomicLong;
034 import java.util.zip.Adler32;
035 import java.util.zip.Checksum;
036
037 import org.apache.commons.logging.Log;
038 import org.apache.commons.logging.LogFactory;
039 import org.apache.kahadb.util.DataByteArrayOutputStream;
040 import org.apache.kahadb.util.IOExceptionSupport;
041 import org.apache.kahadb.util.IOHelper;
042 import org.apache.kahadb.util.IntrospectionSupport;
043 import org.apache.kahadb.util.LRUCache;
044 import org.apache.kahadb.util.Sequence;
045 import org.apache.kahadb.util.SequenceSet;
046
047 /**
048 * A PageFile provides you random access to fixed sized disk pages. This object is not thread safe and therefore access to it should
049 * be externally synchronized.
050 *
051 * The file has 3 parts:
052 * Metadata Space: 4k : Reserved metadata area. Used to store persistent config about the file.
053 * Recovery Buffer Space: Page Size * 1000 : This is a redo log used to prevent partial page writes from making the file inconsistent
054 * Page Space: The pages in the page file.
055 *
056 * @version $Revision: 1038566 $
057 */
public class PageFile {

    private static final String PAGEFILE_SUFFIX = ".data";
    private static final String RECOVERY_FILE_SUFFIX = ".redo";
    private static final String FREE_FILE_SUFFIX = ".free";

    // 4k default page size; overridable via the "defaultPageSize" system property.
    public static final int DEFAULT_PAGE_SIZE = Integer.parseInt(System.getProperty("defaultPageSize", ""+1024*4));
    // Default target number of queued page writes per disk batch; overridable via "defaultWriteBatchSize".
    public static final int DEFAULT_WRITE_BATCH_SIZE = Integer.parseInt(System.getProperty("defaultWriteBatchSize", ""+1000));
    // Fixed byte sizes of the headers reserved at the start of the recovery and page files.
    private static final int RECOVERY_FILE_HEADER_SIZE=1024*4;
    private static final int PAGE_FILE_HEADER_SIZE=1024*4;

    // Recovery record header is (long offset) per page written.
    private static final Log LOG = LogFactory.getLog(PageFile.class);

    // A PageFile will use a couple of files in this directory
    private File directory;
    // And the file names in that directory will be based on this name.
    private final String name;

    // File handle used for reading pages..
    private RandomAccessFile readFile;
    // File handle used for writing pages..
    private RandomAccessFile writeFile;
    // File handle used for writing the double-write recovery buffer..
    private RandomAccessFile recoveryFile;

    // The size of pages
    private int pageSize = DEFAULT_PAGE_SIZE;

    // The minimum amount of space allocated to the recovery file, in number of pages.
    private int recoveryFileMinPageCount = 1000;
    // The max size that we let the recovery file grow to.. it may exceed the max, but the file
    // will get resized back down to this max size as soon as possible.
    private int recoveryFileMaxPageCount = 10000;
    // The number of pages in the current recovery buffer
    private int recoveryPageCount;

    // True while the page file is loaded/usable; guards against double load()/unload().
    private AtomicBoolean loaded = new AtomicBoolean();
    // The number of pages we are aiming to write every time we
    // write to disk.
    int writeBatchSize = DEFAULT_WRITE_BATCH_SIZE;

    // Read cache of recently used pages keyed by page id (null when caching is disabled).
    private Map<Long, Page> pageCache;
    // Is the read page cache enabled?
    private boolean enablePageCaching=true;
    // How many pages will we keep in the cache?
    private int pageCacheSize = 100;

    // Should we first log the page write to the recovery buffer? Avoids partial
    // page write failures..
    private boolean enableRecoveryFile=true;
    // Will we sync writes to disk. Ensures that data will not be lost after a checkpoint()
    private boolean enableDiskSyncs=true;
    // Will writes be done in an async thread?
    private boolean enabledWriteThread=false;

    // These are used if enabledWriteThread==true
    private AtomicBoolean stopWriter = new AtomicBoolean();
    private Thread writerThread;
    // Counted down once all writes queued at flush() time have hit disk.
    private CountDownLatch checkpointLatch;

    // Keeps track of writes that are being written to disk; also serves as the
    // monitor coordinating producers with the writer thread. Keyed by page id.
    private TreeMap<Long, PageWrite> writes=new TreeMap<Long, PageWrite>();

    // The id of the next page that would be appended at the end of the file.
    private final AtomicLong nextFreePageId = new AtomicLong();
    // Keeps track of free pages.
    private SequenceSet freeList = new SequenceSet();

    // Monotonic transaction id generator for page writes.
    private AtomicLong nextTxid = new AtomicLong();

    // Persistent settings stored in the page file.
    private MetaData metaData;
132
    /**
     * Tracks an updated page which has not yet been committed to the main
     * page file. The page image is double-buffered: {@code current} holds the
     * latest bytes queued by callers, while {@code diskBound} holds the bytes
     * the writer is currently flushing, so a page can be updated again while
     * its previous image is in flight. Instances are only mutated while
     * holding the enclosing PageFile's "writes" monitor.
     */
    static class PageWrite {
        Page page;
        // Most recently queued page image; null once handed to the writer via begin().
        byte[] current;
        // Page image currently being flushed to disk; null when no flush is in flight.
        byte[] diskBound;

        public PageWrite(Page page, byte[] data) {
            this.page=page;
            current=data;
        }

        // Replaces the pending (not yet disk-bound) page image with newer data.
        public void setCurrent(Page page, byte[] data) {
            this.page=page;
            current=data;
        }

        @Override
        public String toString() {
            return "[PageWrite:"+page.getPageId()+"]";
        }

        @SuppressWarnings("unchecked")
        public Page getPage() {
            return page;
        }

        // Moves the pending image into the disk-bound slot at the start of a
        // batch write; callers may keep updating "current" while it flushes.
        void begin() {
            diskBound = current;
            current = null;
        }

        /**
         * Marks the in-flight flush as finished.
         *
         * @return true if there is no pending writes to do.
         */
        boolean done() {
            diskBound=null;
            return current == null;
        }

        // True when nothing is queued and nothing is in flight.
        boolean isDone() {
            return diskBound == null && current == null;
        }

    }
179
    /**
     * The MetaData object holds the persistent data associated with a PageFile object.
     * It is serialized to/from the page file header as a java.util.Properties
     * document via IntrospectionSupport, so the getter/setter names form part of
     * the on-disk format and must not be renamed.
     */
    public static class MetaData {

        String fileType;
        String fileTypeVersion;

        // Incremented on every header store; used to pick the freshest of the
        // two redundant header copies on load. -1 means "never stored".
        long metaDataTxId=-1;
        int pageSize;
        // True only when the file was closed via unload(); false implies the
        // recovery buffer must be replayed on the next load.
        boolean cleanShutdown;
        // The last transaction id handed out before a clean shutdown.
        long lastTxId;
        // Number of free pages recorded at shutdown; values <= 0 mean no free
        // list file was stored.
        long freePages;

        public String getFileType() {
            return fileType;
        }
        public void setFileType(String fileType) {
            this.fileType = fileType;
        }
        public String getFileTypeVersion() {
            return fileTypeVersion;
        }
        public void setFileTypeVersion(String version) {
            this.fileTypeVersion = version;
        }
        public long getMetaDataTxId() {
            return metaDataTxId;
        }
        public void setMetaDataTxId(long metaDataTxId) {
            this.metaDataTxId = metaDataTxId;
        }
        public int getPageSize() {
            return pageSize;
        }
        public void setPageSize(int pageSize) {
            this.pageSize = pageSize;
        }
        public boolean isCleanShutdown() {
            return cleanShutdown;
        }
        public void setCleanShutdown(boolean cleanShutdown) {
            this.cleanShutdown = cleanShutdown;
        }
        public long getLastTxId() {
            return lastTxId;
        }
        public void setLastTxId(long lastTxId) {
            this.lastTxId = lastTxId;
        }
        public long getFreePages() {
            return freePages;
        }
        public void setFreePages(long value) {
            this.freePages = value;
        }
    }
237
    /**
     * Creates a new transaction bound to this page file.
     *
     * @return a fresh Transaction for reading/updating pages
     * @throws IllegalStateException if the page file is not loaded
     */
    public Transaction tx() {
        assertLoaded();
        return new Transaction(this);
    }
242
    /**
     * Creates a PageFile in the specified directory whose data files are named by name.
     * No files are opened or created until {@link #load()} is called.
     *
     * @param directory the directory that will hold the page file's data files
     * @param name the base name used to derive the .data/.redo/.free file names
     */
    public PageFile(File directory, String name) {
        this.directory = directory;
        this.name = name;
    }
253
    /**
     * Deletes the files used by the PageFile object (main, free-list and
     * recovery files). This method can only be used when this object is not loaded.
     *
     * @throws IOException
     *         if the files cannot be deleted.
     * @throws IllegalStateException
     *         if this PageFile is loaded
     */
    public void delete() throws IOException {
        if( loaded.get() ) {
            throw new IllegalStateException("Cannot delete page file data when the page file is loaded");
        }
        delete(getMainPageFile());
        delete(getFreeFile());
        delete(getRecoveryFile());
    }
270
271 /**
272 * @param file
273 * @throws IOException
274 */
275 private void delete(File file) throws IOException {
276 if( file.exists() ) {
277 if( !file.delete() ) {
278 throw new IOException("Could not delete: "+file.getPath());
279 }
280 }
281 }
282
    /**
     * Loads the page file so that it can be accessed for read/write purposes. This allocates OS resources. If this is the
     * first time the page file is loaded, then this creates the page file in the file system.
     *
     * @throws IOException
     *         If the page file cannot be loaded. This could be caused by the existing page file being corrupt, being a bad
     *         version, or a disk error.
     * @throws IllegalStateException
     *         If the page file was already loaded.
     */
    public void load() throws IOException, IllegalStateException {
        if (loaded.compareAndSet(false, true)) {

            if( enablePageCaching ) {
                pageCache = Collections.synchronizedMap(new LRUCache<Long, Page>(pageCacheSize, pageCacheSize, 0.75f, true));
            }

            File file = getMainPageFile();
            IOHelper.mkdirs(file.getParentFile());
            writeFile = new RandomAccessFile(file, "rw");
            readFile = new RandomAccessFile(file, "r");

            if (readFile.length() > 0) {
                // Load the page size setting cause that can't change once the file is created.
                loadMetaData();
                pageSize = metaData.getPageSize();
            } else {
                // New file: store the page size setting cause that can't change once the file is created.
                metaData = new MetaData();
                metaData.setFileType(PageFile.class.getName());
                metaData.setFileTypeVersion("1");
                metaData.setPageSize(getPageSize());
                metaData.setCleanShutdown(true);
                metaData.setFreePages(-1);
                metaData.setLastTxId(0);
                storeMetaData();
            }

            if( enableRecoveryFile ) {
                recoveryFile = new RandomAccessFile(getRecoveryFile(), "rw");
            }

            if( metaData.isCleanShutdown() ) {
                // Clean shutdown: trust the stored tx id and (if present) the stored free list.
                nextTxid.set(metaData.getLastTxId()+1);
                if( metaData.getFreePages()>0 ) {
                    loadFreeList();
                }
            } else {
                LOG.debug(toString() + ", Recovering page file...");
                // Replay any consistent batch left in the recovery buffer.
                nextTxid.set(redoRecoveryUpdates());

                // The free-list file can't be trusted after a crash, so scan
                // every page (including free ones) to rebuild it.
                freeList = new SequenceSet();
                for (Iterator i = tx().iterator(true); i.hasNext();) {
                    Page page = (Page)i.next();
                    if( page.getType() == Page.PAGE_FREE_TYPE ) {
                        freeList.add(page.getPageId());
                    }
                }

            }

            // Mark the file dirty while it is open; unload() flips it back to clean.
            metaData.setCleanShutdown(false);
            storeMetaData();
            getFreeFile().delete();

            if( writeFile.length() < PAGE_FILE_HEADER_SIZE) {
                writeFile.setLength(PAGE_FILE_HEADER_SIZE);
            }
            // Derive the next append page id from the current file length.
            nextFreePageId.set((writeFile.length()-PAGE_FILE_HEADER_SIZE)/pageSize);
            startWriter();

        } else {
            throw new IllegalStateException("Cannot load the page file when it is allready loaded.");
        }
    }
359
360
    /**
     * Unloads a previously loaded PageFile. This deallocates OS related resources like file handles.
     * once unloaded, you can no longer use the page file to read or write Pages.
     *
     * @throws IOException
     *         if there was a disk error occurred while closing the down the page file.
     * @throws IllegalStateException
     *         if the PageFile is not loaded
     */
    public void unload() throws IOException {
        if (loaded.compareAndSet(true, false)) {
            // Drain pending writes and stop the async writer before persisting state.
            flush();
            try {
                stopWriter();
            } catch (InterruptedException e) {
                throw new InterruptedIOException();
            }

            // Persist the free list (if non-empty) so the next load can skip the full scan.
            if( freeList.isEmpty() ) {
                metaData.setFreePages(0);
            } else {
                storeFreeList();
                metaData.setFreePages(freeList.size());
            }

            // Record a clean shutdown so load() trusts lastTxId and the free list.
            metaData.setLastTxId( nextTxid.get()-1 );
            metaData.setCleanShutdown(true);
            storeMetaData();

            // NOTE(review): writeFile/recoveryFile are only closed when readFile
            // is non-null; since load() always opens both together this holds in
            // practice, but confirm no path leaves them out of sync.
            if (readFile != null) {
                readFile.close();
                readFile = null;
                writeFile.close();
                writeFile=null;
                if( enableRecoveryFile ) {
                    recoveryFile.close();
                    recoveryFile=null;
                }
                freeList.clear();
                if( pageCache!=null ) {
                    pageCache=null;
                }
                synchronized(writes) {
                    writes.clear();
                }
            }
        } else {
            throw new IllegalStateException("Cannot unload the page file when it is not loaded");
        }
    }
411
    /**
     * @return true if the page file is currently loaded and usable.
     */
    public boolean isLoaded() {
        return loaded.get();
    }
415
    /**
     * Flush and sync all write buffers to disk. With the async writer enabled
     * this blocks on a checkpoint latch until the writer thread has flushed
     * everything queued at the time of the call; otherwise the batch is
     * written inline on the calling thread.
     *
     * @throws IOException
     *         If an disk error occurred.
     */
    public void flush() throws IOException {

        if( enabledWriteThread && stopWriter.get() ) {
            throw new IOException("Page file already stopped: checkpointing is not allowed");
        }

        // Setup a latch that gets notified when all buffered writes hits the disk.
        // (Intentionally shadows the field: we snapshot the latch under the lock
        // so a concurrent writeBatch() nulling the field can't be missed.)
        CountDownLatch checkpointLatch;
        synchronized( writes ) {
            if( writes.isEmpty()) {
                return;
            }
            if( enabledWriteThread ) {
                if( this.checkpointLatch == null ) {
                    this.checkpointLatch = new CountDownLatch(1);
                }
                checkpointLatch = this.checkpointLatch;
                writes.notify();
            } else {
                writeBatch();
                return;
            }
        }
        try {
            // NOTE(review): writes.size() is read outside the lock; the value is
            // only used for the log message so the race is benign.
            int size = writes.size();
            long start = System.currentTimeMillis();
            checkpointLatch.await();
            long end = System.currentTimeMillis();
            if( end-start > 100 ) {
                LOG.warn("KahaDB PageFile flush: " + size + " queued writes, latch wait took "+(end-start));
            }
        } catch (InterruptedException e) {
            throw new InterruptedIOException();
        }
    }
457
458
459 public String toString() {
460 return "Page File: "+getMainPageFile();
461 }
462
463 ///////////////////////////////////////////////////////////////////
464 // Private Implementation Methods
465 ///////////////////////////////////////////////////////////////////
    // The main ".data" file holding the header and all pages.
    private File getMainPageFile() {
        return new File(directory, IOHelper.toFileSystemSafeName(name)+PAGEFILE_SUFFIX);
    }

    // The ".free" side file storing the free-page list across clean shutdowns.
    public File getFreeFile() {
        return new File(directory, IOHelper.toFileSystemSafeName(name)+FREE_FILE_SUFFIX);
    }

    // The ".redo" file holding the double-write recovery buffer.
    public File getRecoveryFile() {
        return new File(directory, IOHelper.toFileSystemSafeName(name)+RECOVERY_FILE_SUFFIX);
    }

    // Maps a page id to its absolute byte offset within the main page file.
    private long toOffset(long pageId) {
        return PAGE_FILE_HEADER_SIZE+(pageId*pageSize);
    }
481
482 private void loadMetaData() throws IOException {
483
484 ByteArrayInputStream is;
485 MetaData v1 = new MetaData();
486 MetaData v2 = new MetaData();
487 try {
488 Properties p = new Properties();
489 byte[] d = new byte[PAGE_FILE_HEADER_SIZE/2];
490 readFile.seek(0);
491 readFile.readFully(d);
492 is = new ByteArrayInputStream(d);
493 p.load(is);
494 IntrospectionSupport.setProperties(v1, p);
495 } catch (IOException e) {
496 v1 = null;
497 }
498
499 try {
500 Properties p = new Properties();
501 byte[] d = new byte[PAGE_FILE_HEADER_SIZE/2];
502 readFile.seek(PAGE_FILE_HEADER_SIZE/2);
503 readFile.readFully(d);
504 is = new ByteArrayInputStream(d);
505 p.load(is);
506 IntrospectionSupport.setProperties(v2, p);
507 } catch (IOException e) {
508 v2 = null;
509 }
510
511 if( v1==null && v2==null ) {
512 throw new IOException("Could not load page file meta data");
513 }
514
515 if( v1 == null || v1.metaDataTxId<0 ) {
516 metaData = v2;
517 } else if( v2==null || v1.metaDataTxId<0 ) {
518 metaData = v1;
519 } else if( v1.metaDataTxId==v2.metaDataTxId ) {
520 metaData = v1; // use the first since the 2nd could be a partial..
521 } else {
522 metaData = v2; // use the second cause the first is probably a partial.
523 }
524 }
525
526 private void storeMetaData() throws IOException {
527 // Convert the metadata into a property format
528 metaData.metaDataTxId++;
529 Properties p = new Properties();
530 IntrospectionSupport.getProperties(metaData, p, null);
531
532 ByteArrayOutputStream os = new ByteArrayOutputStream(PAGE_FILE_HEADER_SIZE);
533 p.store(os, "");
534 if( os.size() > PAGE_FILE_HEADER_SIZE/2) {
535 throw new IOException("Configuation is to larger than: "+PAGE_FILE_HEADER_SIZE/2);
536 }
537 // Fill the rest with space...
538 byte[] filler = new byte[(PAGE_FILE_HEADER_SIZE/2)-os.size()];
539 Arrays.fill(filler, (byte)' ');
540 os.write(filler);
541 os.flush();
542
543 byte[] d = os.toByteArray();
544
545 // So we don't loose it.. write it 2 times...
546 writeFile.seek(0);
547 writeFile.write(d);
548 writeFile.getFD().sync();
549 writeFile.seek(PAGE_FILE_HEADER_SIZE/2);
550 writeFile.write(d);
551 writeFile.getFD().sync();
552 }
553
554 private void storeFreeList() throws IOException {
555 FileOutputStream os = new FileOutputStream(getFreeFile());
556 DataOutputStream dos = new DataOutputStream(os);
557 SequenceSet.Marshaller.INSTANCE.writePayload(freeList, dos);
558 dos.close();
559 }
560
561 private void loadFreeList() throws IOException {
562 freeList.clear();
563 FileInputStream is = new FileInputStream(getFreeFile());
564 DataInputStream dis = new DataInputStream(is);
565 freeList = SequenceSet.Marshaller.INSTANCE.readPayload(dis);
566 dis.close();
567 }
568
569 ///////////////////////////////////////////////////////////////////
570 // Property Accessors
571 ///////////////////////////////////////////////////////////////////
572
    /**
     * Is the recovery buffer used to double buffer page writes. Enabled by default.
     *
     * @return is the recovery buffer enabled.
     */
    public boolean isEnableRecoveryFile() {
        return enableRecoveryFile;
    }

    /**
     * Sets whether the recovery buffer is used to double buffer page writes. Enabled by default.
     * Disabling this may potentially cause partial page writes which can lead to page file corruption.
     *
     * @param doubleBuffer true to enable the recovery buffer
     * @throws IllegalStateException once the page file is loaded
     */
    public void setEnableRecoveryFile(boolean doubleBuffer) {
        assertNotLoaded();
        this.enableRecoveryFile = doubleBuffer;
    }

    /**
     * @return Are page writes synced to disk?
     */
    public boolean isEnableDiskSyncs() {
        return enableDiskSyncs;
    }

    /**
     * Allows you to enable syncing writes to disk.
     *
     * @param syncWrites true to fsync after each write batch
     * @throws IllegalStateException once the page file is loaded
     */
    public void setEnableDiskSyncs(boolean syncWrites) {
        assertNotLoaded();
        this.enableDiskSyncs = syncWrites;
    }

    /**
     * @return the page size
     */
    public int getPageSize() {
        return this.pageSize;
    }

    /**
     * @return the amount of content data that a page can hold (page size minus the page header).
     */
    public int getPageContentSize() {
        return this.pageSize-Page.PAGE_HEADER_SIZE;
    }

    /**
     * Configures the page size used by the page file. By default it is 4k. Once a page file is created on disk,
     * subsequent loads of that file will use the original pageSize. Once the PageFile is loaded, this setting
     * can no longer be changed.
     *
     * @param pageSize the pageSize to set
     * @throws IllegalStateException
     *         once the page file is loaded.
     */
    public void setPageSize(int pageSize) throws IllegalStateException {
        assertNotLoaded();
        this.pageSize = pageSize;
    }

    /**
     * @return true if read page caching is enabled
     */
    public boolean isEnablePageCaching() {
        return this.enablePageCaching;
    }

    /**
     * @param enablePageCaching true to enable read page caching
     * @throws IllegalStateException once the page file is loaded
     */
    public void setEnablePageCaching(boolean enablePageCaching) {
        assertNotLoaded();
        this.enablePageCaching = enablePageCaching;
    }

    /**
     * @return the maximum number of pages that will get stored in the read page cache.
     */
    public int getPageCacheSize() {
        return this.pageCacheSize;
    }

    /**
     * @param pageCacheSize the maximum number of pages that will get stored in the read page cache.
     * @throws IllegalStateException once the page file is loaded
     */
    public void setPageCacheSize(int pageCacheSize) {
        assertNotLoaded();
        this.pageCacheSize = pageCacheSize;
    }

    /**
     * @return true if page writes are performed by a dedicated background thread.
     */
    public boolean isEnabledWriteThread() {
        return enabledWriteThread;
    }

    /**
     * @param enableAsyncWrites true to perform page writes on a background thread
     * @throws IllegalStateException once the page file is loaded
     */
    public void setEnableWriteThread(boolean enableAsyncWrites) {
        assertNotLoaded();
        this.enabledWriteThread = enableAsyncWrites;
    }

    /**
     * @return the on-disk size consumed by the allocated pages (header included).
     */
    public long getDiskSize() throws IOException {
        return toOffset(nextFreePageId.get());
    }

    /**
     * @return the number of pages allocated in the PageFile
     */
    public long getPageCount() {
        return nextFreePageId.get();
    }

    public int getRecoveryFileMinPageCount() {
        return recoveryFileMinPageCount;
    }

    /**
     * @param recoveryFileMinPageCount minimum recovery file capacity, in pages
     * @throws IllegalStateException once the page file is loaded
     */
    public void setRecoveryFileMinPageCount(int recoveryFileMinPageCount) {
        assertNotLoaded();
        this.recoveryFileMinPageCount = recoveryFileMinPageCount;
    }

    public int getRecoveryFileMaxPageCount() {
        return recoveryFileMaxPageCount;
    }

    /**
     * @param recoveryFileMaxPageCount capacity (in pages) the recovery file is shrunk back to
     * @throws IllegalStateException once the page file is loaded
     */
    public void setRecoveryFileMaxPageCount(int recoveryFileMaxPageCount) {
        assertNotLoaded();
        this.recoveryFileMaxPageCount = recoveryFileMaxPageCount;
    }

    public int getWriteBatchSize() {
        return writeBatchSize;
    }

    /**
     * @param writeBatchSize target number of queued page writes per disk batch
     * @throws IllegalStateException once the page file is loaded
     */
    public void setWriteBatchSize(int writeBatchSize) {
        assertNotLoaded();
        this.writeBatchSize = writeBatchSize;
    }
711
712 ///////////////////////////////////////////////////////////////////
713 // Package Protected Methods exposed to Transaction
714 ///////////////////////////////////////////////////////////////////
715
    /**
     * @throws IllegalStateException if the page file is not loaded.
     */
    void assertLoaded() throws IllegalStateException {
        if( !loaded.get() ) {
            throw new IllegalStateException("PageFile is not loaded");
        }
    }

    /**
     * Guards configuration setters that may only be changed before load().
     *
     * @throws IllegalStateException if the page file is loaded.
     */
    void assertNotLoaded() throws IllegalStateException {
        if( loaded.get() ) {
            throw new IllegalStateException("PageFile is loaded");
        }
    }
729
    /**
     * Allocates a block of free pages that you can write data to. Pages are
     * taken from the free list when a long-enough contiguous run exists;
     * otherwise new free pages are appended at the end of the file (and
     * immediately written out as free pages).
     *
     * @param count the number of sequential pages to allocate
     * @return the first page of the sequential set.
     * @throws IOException
     *         If an disk error occurred.
     * @throws IllegalStateException
     *         if the PageFile is not loaded
     */
    <T> Page<T> allocate(int count) throws IOException {
        assertLoaded();
        if (count <= 0) {
            throw new IllegalArgumentException("The allocation count must be larger than zero");
        }

        Sequence seq = freeList.removeFirstSequence(count);

        // We may need to create new free pages...
        if (seq == null) {

            Page<T> first = null;
            int c = count;
            while (c > 0) {
                Page<T> page = new Page<T>(nextFreePageId.getAndIncrement());
                page.makeFree(getNextWriteTransactionId());

                if (first == null) {
                    first = page;
                }

                addToCache(page);
                // Persist the page as free so a crash mid-allocation leaves a
                // consistent file (the free scan will reclaim it).
                DataByteArrayOutputStream out = new DataByteArrayOutputStream(pageSize);
                page.write(out);
                write(page, out.getData());

                // LOG.debug("allocate writing: "+page.getPageId());
                c--;
            }

            return first;
        }

        // Reusing pages from the free list: they are already marked free on disk.
        Page<T> page = new Page<T>(seq.getFirst());
        page.makeFree(0);
        // LOG.debug("allocated: "+page.getPageId());
        return page;
    }
778
    /**
     * @return a new, monotonically increasing transaction id to stamp on a page write.
     */
    long getNextWriteTransactionId() {
        return nextTxid.incrementAndGet();
    }

    /**
     * Reads the raw bytes of a page straight from the main page file.
     *
     * @param pageId the page to read
     * @param data destination buffer; expected to be pageSize bytes long
     * @throws IOException on a disk error or short read
     */
    void readPage(long pageId, byte[] data) throws IOException {
        readFile.seek(toOffset(pageId));
        readFile.readFully(data);
    }

    /**
     * Returns a page to the free list and evicts it from the read cache.
     * NOTE(review): unlike most entry points this does not call assertLoaded();
     * invoking it on an unloaded file with caching enabled would NPE on the
     * null pageCache — confirm callers only use it while loaded.
     */
    public void freePage(long pageId) {
        freeList.add(pageId);
        if( enablePageCaching ) {
            pageCache.remove(pageId);
        }
    }
794
795 @SuppressWarnings("unchecked")
796 private <T> void write(Page<T> page, byte[] data) throws IOException {
797 final PageWrite write = new PageWrite(page, data);
798 Entry<Long, PageWrite> entry = new Entry<Long, PageWrite>(){
799 public Long getKey() {
800 return write.getPage().getPageId();
801 }
802 public PageWrite getValue() {
803 return write;
804 }
805 public PageWrite setValue(PageWrite value) {
806 return null;
807 }
808 };
809 Entry<Long, PageWrite>[] entries = new Map.Entry[]{entry};
810 write(Arrays.asList(entries));
811 }
812
    /**
     * Queues a batch of page updates into the write cache, merging with any
     * pending write for the same page id. Depending on configuration the cache
     * is flushed inline on this thread or handed to the async writer.
     *
     * @param updates page-id/PageWrite entries to queue
     * @throws IOException on a disk error during an inline flush
     * @throws InterruptedIOException if interrupted while waiting for the
     *         async writer to drain a full write cache
     */
    void write(Collection<Map.Entry<Long, PageWrite>> updates) throws IOException {
        synchronized( writes ) {
            if( enabledWriteThread ) {
                // Back pressure: don't let the write cache grow past the batch
                // size while the writer thread is still running.
                while( writes.size() >= writeBatchSize && !stopWriter.get() ) {
                    try {
                        writes.wait();
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        throw new InterruptedIOException();
                    }
                }
            }

            for (Map.Entry<Long, PageWrite> entry : updates) {
                Long key = entry.getKey();
                PageWrite value = entry.getValue();
                PageWrite write = writes.get(key);
                if( write==null ) {
                    writes.put(key, value);
                } else {
                    // A write for this page is already queued: just replace its
                    // pending image with the newer data.
                    write.setCurrent(value.page, value.current);
                }
            }

            // Once we start approaching capacity, notify the writer to start writing
            if( canStartWriteBatch() ) {
                if( enabledWriteThread ) {
                    writes.notify();
                } else {
                    writeBatch();
                }
            }
        }
    }
847
848 private boolean canStartWriteBatch() {
849 int capacityUsed = ((writes.size() * 100)/writeBatchSize);
850 if( enabledWriteThread ) {
851 // The constant 10 here controls how soon write batches start going to disk..
852 // would be nice to figure out how to auto tune that value. Make to small and
853 // we reduce through put because we are locking the write mutex too often doing writes
854 return capacityUsed >= 10 || checkpointLatch!=null;
855 } else {
856 return capacityUsed >= 80 || checkpointLatch!=null;
857 }
858 }
859
860 ///////////////////////////////////////////////////////////////////
861 // Cache Related operations
862 ///////////////////////////////////////////////////////////////////
    /**
     * Looks a page up in the caches. A queued-but-unflushed write is the most
     * recent version of a page, so the write cache is consulted before the
     * read cache.
     *
     * @return the cached page, or null when not cached
     */
    @SuppressWarnings("unchecked")
    <T> Page<T> getFromCache(long pageId) {
        synchronized(writes) {
            PageWrite pageWrite = writes.get(pageId);
            if( pageWrite != null ) {
                return pageWrite.page;
            }
        }

        Page<T> result = null;
        if (enablePageCaching) {
            result = pageCache.get(pageId);
        }
        return result;
    }

    // Caches the page for future reads (no-op when caching is disabled).
    void addToCache(Page page) {
        if (enablePageCaching) {
            pageCache.put(page.getPageId(), page);
        }
    }

    // Evicts the page from the read cache (no-op when caching is disabled).
    void removeFromCache(Page page) {
        if (enablePageCaching) {
            pageCache.remove(page.getPageId());
        }
    }
890
891 ///////////////////////////////////////////////////////////////////
892 // Internal Double write implementation follows...
893 ///////////////////////////////////////////////////////////////////
894 /**
895 *
896 */
897 private void pollWrites() {
898 try {
899 while( !stopWriter.get() ) {
900 // Wait for a notification...
901 synchronized( writes ) {
902 writes.notifyAll();
903
904 // If there is not enough to write, wait for a notification...
905 while( writes.isEmpty() && checkpointLatch==null && !stopWriter.get() ) {
906 writes.wait(100);
907 }
908
909 if( writes.isEmpty() ) {
910 releaseCheckpointWaiter();
911 }
912 }
913 writeBatch();
914 }
915 } catch (Throwable e) {
916 e.printStackTrace();
917 } finally {
918 releaseCheckpointWaiter();
919 }
920 }
921
    /**
     * Flushes one batch of queued page updates to disk. When the recovery file
     * is enabled the whole batch plus an Adler-32 checksum is first recorded
     * there and synced, so a crash during the main-file writes can be redone on
     * the next load. Any flush() callers waiting on the checkpoint latch that
     * existed at batch start are released when the batch completes.
     *
     * @throws IOException if a disk error occurred
     */
    private void writeBatch() throws IOException {

        CountDownLatch checkpointLatch;
        ArrayList<PageWrite> batch;
        synchronized( writes ) {
            // If there is not enough to write, wait for a notification...

            batch = new ArrayList<PageWrite>(writes.size());
            // build a write batch from the current write cache.
            for (PageWrite write : writes.values()) {
                batch.add(write);
                // Move the current write to the diskBound write, this lets folks update the
                // page again without blocking for this write.
                write.begin();
                if (write.diskBound == null) {
                    // Nothing actually pending for this entry; drop it from the batch.
                    batch.remove(write);
                }
            }

            // Grab on to the existing checkpoint latch cause once we do this write we can
            // release the folks that were waiting for those writes to hit disk.
            checkpointLatch = this.checkpointLatch;
            this.checkpointLatch=null;
        }

        try {
            if (enableRecoveryFile) {

                // Using Adler-32 instead of CRC-32 because it's much faster and
                // its weakness for short messages with few hundred bytes is not a
                // factor in this case since we know our write batches are going
                // to be much larger.
                Checksum checksum = new Adler32();
                for (PageWrite w : batch) {
                    try {
                        checksum.update(w.diskBound, 0, pageSize);
                    } catch (Throwable t) {
                        throw IOExceptionSupport.create(
                                "Cannot create recovery file. Reason: " + t, t);
                    }
                }

                // Can we shrink the recovery buffer??
                if (recoveryPageCount > recoveryFileMaxPageCount) {
                    int t = Math.max(recoveryFileMinPageCount, batch.size());
                    recoveryFile.setLength(recoveryFileSizeForPages(t));
                }

                // Record the page writes in the recovery buffer.
                recoveryFile.seek(0);
                // Store the next tx id...
                recoveryFile.writeLong(nextTxid.get());
                // Store the checksum for the write batch so that on recovery we
                // know if we have a consistent write batch on disk.
                recoveryFile.writeLong(checksum.getValue());
                // Write the # of pages that will follow
                recoveryFile.writeInt(batch.size());

                // Write the pages.
                recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);

                for (PageWrite w : batch) {
                    recoveryFile.writeLong(w.page.getPageId());
                    recoveryFile.write(w.diskBound, 0, pageSize);
                }

                if (enableDiskSyncs) {
                    // Sync to make sure recovery buffer writes land on disk..
                    recoveryFile.getFD().sync();
                }

                recoveryPageCount = batch.size();
            }

            // Now apply the batch to the main page file.
            for (PageWrite w : batch) {
                writeFile.seek(toOffset(w.page.getPageId()));
                writeFile.write(w.diskBound, 0, pageSize);
                w.done();
            }

            // Sync again
            if (enableDiskSyncs) {
                writeFile.getFD().sync();
            }

        } finally {
            synchronized (writes) {
                for (PageWrite w : batch) {
                    // If there are no more pending writes, then remove it from
                    // the write cache.
                    if (w.isDone()) {
                        writes.remove(w.page.getPageId());
                    }
                }
            }

            if( checkpointLatch!=null ) {
                checkpointLatch.countDown();
            }
        }
    }
1033
    // Bytes needed by the recovery file for its header plus pageCount records
    // (each record = 8-byte page id + one page of data).
    private long recoveryFileSizeForPages(int pageCount) {
        return RECOVERY_FILE_HEADER_SIZE+((pageSize+8)*pageCount);
    }

    // Wakes any flush() caller waiting on the checkpoint latch and clears it.
    // Called on writer shutdown or when the write cache is empty.
    private void releaseCheckpointWaiter() {
        if( checkpointLatch!=null ) {
            checkpointLatch.countDown();
            checkpointLatch=null;
        }
    }
1044
    /**
     * Inspects the recovery buffer and re-applies any partially applied page
     * writes. The buffer is only replayed when it is fully intact (all records
     * readable and the stored Adler-32 checksum matches); otherwise the main
     * page file is assumed consistent and the buffer is ignored.
     *
     * @return the next transaction id that can be used.
     * @throws IOException on a disk error while replaying
     */
    private long redoRecoveryUpdates() throws IOException {
        if( !enableRecoveryFile ) {
            return 0;
        }
        recoveryPageCount=0;

        // Are we initializing the recovery file?
        if( recoveryFile.length() == 0 ) {
            // Write an empty header..
            recoveryFile.write(new byte[RECOVERY_FILE_HEADER_SIZE]);
            // Preallocate the minimum size for better performance.
            recoveryFile.setLength(recoveryFileSizeForPages(recoveryFileMinPageCount));
            return 0;
        }

        // How many recovery pages do we have in the recovery buffer?
        recoveryFile.seek(0);
        long nextTxId = recoveryFile.readLong();
        long expectedChecksum = recoveryFile.readLong();
        int pageCounter = recoveryFile.readInt();

        recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
        Checksum checksum = new Adler32();
        LinkedHashMap<Long, byte[]> batch = new LinkedHashMap<Long, byte[]>();
        try {
            for (int i = 0; i < pageCounter; i++) {
                long offset = recoveryFile.readLong();
                byte []data = new byte[pageSize];
                if( recoveryFile.read(data, 0, pageSize) != pageSize ) {
                    // Invalid recovery record, Could not fully read the data". Probably due to a partial write to the recovery buffer
                    return nextTxId;
                }
                checksum.update(data, 0, pageSize);
                batch.put(offset, data);
            }
        } catch (Exception e) {
            // If an error occurred it was cause the redo buffer was not fully written out correctly.. so don't redo it.
            // as the pages should still be consistent.
            LOG.debug("Redo buffer was not fully intact: ", e);
            return nextTxId;
        }

        recoveryPageCount = pageCounter;

        // If the checksum is not valid then the recovery buffer was partially written to disk.
        if( checksum.getValue() != expectedChecksum ) {
            return nextTxId;
        }

        // Re-apply all the writes in the recovery buffer.
        for (Map.Entry<Long, byte[]> e : batch.entrySet()) {
            writeFile.seek(toOffset(e.getKey()));
            writeFile.write(e.getValue());
        }

        // And sync it to disk
        writeFile.getFD().sync();
        return nextTxId;
    }
1111
    // Starts the async writer thread when write threading is enabled; no-op otherwise.
    private void startWriter() {
        synchronized( writes ) {
            if( enabledWriteThread ) {
                stopWriter.set(false);
                writerThread = new Thread("KahaDB Page Writer") {
                    @Override
                    public void run() {
                        pollWrites();
                    }
                };
                writerThread.setPriority(Thread.MAX_PRIORITY);
                writerThread.setDaemon(true);
                writerThread.start();
            }
        }
    }

    // Signals the writer thread to stop and waits for it to finish its last batch.
    private void stopWriter() throws InterruptedException {
        if( enabledWriteThread ) {
            stopWriter.set(true);
            writerThread.join();
        }
    }

    /**
     * @return the main ".data" page file on disk.
     */
    public File getFile() {
        return getMainPageFile();
    }
1139
1140 }