001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.kahadb.page;
018    
019    import java.io.ByteArrayInputStream;
020    import java.io.ByteArrayOutputStream;
021    import java.io.DataInputStream;
022    import java.io.DataOutputStream;
023    import java.io.File;
024    import java.io.FileInputStream;
025    import java.io.FileOutputStream;
026    import java.io.IOException;
027    import java.io.InterruptedIOException;
028    import java.io.RandomAccessFile;
029    import java.util.*;
030    import java.util.Map.Entry;
031    import java.util.concurrent.CountDownLatch;
032    import java.util.concurrent.atomic.AtomicBoolean;
033    import java.util.concurrent.atomic.AtomicLong;
034    import java.util.zip.Adler32;
035    import java.util.zip.Checksum;
036    
037    import org.apache.commons.logging.Log;
038    import org.apache.commons.logging.LogFactory;
039    import org.apache.kahadb.util.DataByteArrayOutputStream;
040    import org.apache.kahadb.util.IOExceptionSupport;
041    import org.apache.kahadb.util.IOHelper;
042    import org.apache.kahadb.util.IntrospectionSupport;
043    import org.apache.kahadb.util.LRUCache;
044    import org.apache.kahadb.util.Sequence;
045    import org.apache.kahadb.util.SequenceSet;
046    
047    /**
048     * A PageFile provides you random access to fixed sized disk pages. This object is not thread safe and therefore access to it should 
049     * be externally synchronized.
050     * 
051     * The file has 3 parts:
052     * Metadata Space: 4k : Reserved metadata area. Used to store persistent config about the file.
053     * Recovery Buffer Space: Page Size * 1000 : This is a redo log used to prevent partial page writes from making the file inconsistent
054     * Page Space: The pages in the page file.
055     * 
056     * @version $Revision: 1038566 $
057     */
058    public class PageFile {
059        
060        private static final String PAGEFILE_SUFFIX = ".data";
061        private static final String RECOVERY_FILE_SUFFIX = ".redo";
062        private static final String FREE_FILE_SUFFIX = ".free";
063        
064        // 4k Default page size.
065        public static final int DEFAULT_PAGE_SIZE = Integer.parseInt(System.getProperty("defaultPageSize", ""+1024*4)); 
066        public static final int DEFAULT_WRITE_BATCH_SIZE = Integer.parseInt(System.getProperty("defaultWriteBatchSize", ""+1000));
067        private static final int RECOVERY_FILE_HEADER_SIZE=1024*4;
068        private static final int PAGE_FILE_HEADER_SIZE=1024*4;
069    
070        // Recovery header is (long offset)
071        private static final Log LOG = LogFactory.getLog(PageFile.class);
072    
073        // A PageFile will use a couple of files in this directory
074        private File directory;
075        // And the file names in that directory will be based on this name.
076        private final String name;
077        
078        // File handle used for reading pages..
079        private RandomAccessFile readFile;
080        // File handle used for writing pages..
081        private RandomAccessFile writeFile;
    // File handle used for writing to the recovery (redo) file..
083        private RandomAccessFile recoveryFile;
084    
085        // The size of pages
086        private int pageSize = DEFAULT_PAGE_SIZE;
087        
088        // The minimum number of space allocated to the recovery file in number of pages.
089        private int recoveryFileMinPageCount = 1000;
    // The max size that we let the recovery file grow to.. may exceed the max, but the file will get resized
    // to this max size as soon as possible.
092        private int recoveryFileMaxPageCount = 10000;
093        // The number of pages in the current recovery buffer
094        private int recoveryPageCount;
095    
096        private AtomicBoolean loaded = new AtomicBoolean();
097        // The number of pages we are aiming to write every time we 
098        // write to disk.
099        int writeBatchSize = DEFAULT_WRITE_BATCH_SIZE;
100    
    // Cache of recently used pages, keyed by page id.
102        private Map<Long, Page> pageCache;
    // Is the read page cache enabled?
104        private boolean enablePageCaching=true;
105        // How many pages will we keep in the cache?
106        private int pageCacheSize = 100;
107        
108        // Should first log the page write to the recovery buffer? Avoids partial
109        // page write failures..
110        private boolean enableRecoveryFile=true;
111        // Will we sync writes to disk. Ensures that data will not be lost after a checkpoint()
112        private boolean enableDiskSyncs=true;
113        // Will writes be done in an async thread?
114        private boolean enabledWriteThread=false;
115    
116        // These are used if enableAsyncWrites==true 
117        private AtomicBoolean stopWriter = new AtomicBoolean();
118        private Thread writerThread;
119        private CountDownLatch checkpointLatch;
120    
121        // Keeps track of writes that are being written to disk.
122        private TreeMap<Long, PageWrite> writes=new TreeMap<Long, PageWrite>();
123    
124        // Keeps track of free pages.
125        private final AtomicLong nextFreePageId = new AtomicLong();
126        private SequenceSet freeList = new SequenceSet();
127        
128        private AtomicLong nextTxid = new AtomicLong();
129        
130        // Persistent settings stored in the page file. 
131        private MetaData metaData;
132        
133        /**
134         * Use to keep track of updated pages which have not yet been committed.
135         */
136        static class PageWrite {
137            Page page;
138            byte[] current;
139            byte[] diskBound;
140    
141            public PageWrite(Page page, byte[] data) {
142                this.page=page;
143                current=data;
144            }
145                    
146            public void setCurrent(Page page, byte[] data) {
147                this.page=page;
148                current=data;
149            }
150    
151            @Override
152            public String toString() {
153                return "[PageWrite:"+page.getPageId()+"]";
154            }
155    
156            @SuppressWarnings("unchecked")
157            public Page getPage() {
158                return page;
159            }
160            
161            void begin() {
162               diskBound = current;
163               current = null;
164            }
165            
166            /**
167             * @return true if there is no pending writes to do.
168             */
169            boolean done() {
170                diskBound=null;
171                return current == null;
172            }
173            
174            boolean isDone() {
175                return diskBound == null && current == null;
176            }
177    
178        }
179        
180        /**
181         * The MetaData object hold the persistent data associated with a PageFile object. 
182         */
183        public static class MetaData {
184            
185            String fileType;
186            String fileTypeVersion;
187            
188            long metaDataTxId=-1;
189            int pageSize;
190            boolean cleanShutdown;
191            long lastTxId;
192            long freePages;
193            
194            public String getFileType() {
195                return fileType;
196            }
197            public void setFileType(String fileType) {
198                this.fileType = fileType;
199            }
200            public String getFileTypeVersion() {
201                return fileTypeVersion;
202            }
203            public void setFileTypeVersion(String version) {
204                this.fileTypeVersion = version;
205            }
206            public long getMetaDataTxId() {
207                return metaDataTxId;
208            }
209            public void setMetaDataTxId(long metaDataTxId) {
210                this.metaDataTxId = metaDataTxId;
211            }
212            public int getPageSize() {
213                return pageSize;
214            }
215            public void setPageSize(int pageSize) {
216                this.pageSize = pageSize;
217            }
218            public boolean isCleanShutdown() {
219                return cleanShutdown;
220            }
221            public void setCleanShutdown(boolean cleanShutdown) {
222                this.cleanShutdown = cleanShutdown;
223            }
224            public long getLastTxId() {
225                return lastTxId;
226            }
227            public void setLastTxId(long lastTxId) {
228                this.lastTxId = lastTxId;
229            }
230            public long getFreePages() {
231                return freePages;
232            }
233            public void setFreePages(long value) {
234                this.freePages = value;
235            }
236        }
237    
    /**
     * Starts a new Transaction against this page file; the Transaction is the
     * entry point for reading and updating pages.
     *
     * @return a new Transaction bound to this PageFile
     * @throws IllegalStateException if the page file is not loaded
     */
    public Transaction tx() {
        assertLoaded();
        return new Transaction(this);
    }
242        
243        /**
244         * Creates a PageFile in the specified directory who's data files are named by name.
245         * 
246         * @param directory
247         * @param name
248         */
249        public PageFile(File directory, String name) {
250            this.directory = directory;
251            this.name = name;
252        }
253        
254        /**
255         * Deletes the files used by the PageFile object.  This method can only be used when this object is not loaded.
256         * 
257         * @throws IOException 
258         *         if the files cannot be deleted.
259         * @throws IllegalStateException 
260         *         if this PageFile is loaded
261         */
262        public void delete() throws IOException {
263            if( loaded.get() ) {
264                throw new IllegalStateException("Cannot delete page file data when the page file is loaded");
265            }
266            delete(getMainPageFile());
267            delete(getFreeFile());
268            delete(getRecoveryFile());
269        }
270    
271        /**
272         * @param file
273         * @throws IOException
274         */
275        private void delete(File file) throws IOException {
276            if( file.exists() ) {
277                if( !file.delete() ) {
278                    throw new IOException("Could not delete: "+file.getPath());
279                }
280            }
281        }
282        
283        /**
284         * Loads the page file so that it can be accessed for read/write purposes.  This allocates OS resources.  If this is the 
285         * first time the page file is loaded, then this creates the page file in the file system.
286         * 
287         * @throws IOException
288         *         If the page file cannot be loaded. This could be cause the existing page file is corrupt is a bad version or if 
289         *         there was a disk error.
290         * @throws IllegalStateException 
291         *         If the page file was already loaded.
292         */
293        public void load() throws IOException, IllegalStateException {
294            if (loaded.compareAndSet(false, true)) {
295                
296                if( enablePageCaching ) {
297                    pageCache = Collections.synchronizedMap(new LRUCache<Long, Page>(pageCacheSize, pageCacheSize, 0.75f, true));
298                }
299                
300                File file = getMainPageFile();
301                IOHelper.mkdirs(file.getParentFile());
302                writeFile = new RandomAccessFile(file, "rw");
303                readFile = new RandomAccessFile(file, "r");
304                
305                if (readFile.length() > 0) {
306                    // Load the page size setting cause that can't change once the file is created.
307                    loadMetaData();
308                    pageSize = metaData.getPageSize();
309                } else {
310                    // Store the page size setting cause that can't change once the file is created.
311                    metaData = new MetaData();
312                    metaData.setFileType(PageFile.class.getName());
313                    metaData.setFileTypeVersion("1");
314                    metaData.setPageSize(getPageSize());
315                    metaData.setCleanShutdown(true);
316                    metaData.setFreePages(-1);
317                    metaData.setLastTxId(0);
318                    storeMetaData();
319                }
320    
321                if( enableRecoveryFile ) {
322                    recoveryFile = new RandomAccessFile(getRecoveryFile(), "rw");
323                }
324                
325                if(  metaData.isCleanShutdown() ) {
326                    nextTxid.set(metaData.getLastTxId()+1);
327                    if( metaData.getFreePages()>0 ) {
328                        loadFreeList();
329                    } 
330                } else {
331                    LOG.debug(toString() + ", Recovering page file...");
332                    nextTxid.set(redoRecoveryUpdates());
333                    
334                    // Scan all to find the free pages.
335                    freeList = new SequenceSet();
336                    for (Iterator i = tx().iterator(true); i.hasNext();) {
337                        Page page = (Page)i.next();
338                        if( page.getType() == Page.PAGE_FREE_TYPE ) {
339                            freeList.add(page.getPageId());
340                        }
341                    }
342                    
343                }
344                
345                metaData.setCleanShutdown(false);
346                storeMetaData();
347                getFreeFile().delete();
348                
349                if( writeFile.length() < PAGE_FILE_HEADER_SIZE) {
350                    writeFile.setLength(PAGE_FILE_HEADER_SIZE);
351                }
352                nextFreePageId.set((writeFile.length()-PAGE_FILE_HEADER_SIZE)/pageSize);
353                startWriter();
354                    
355            } else {
356                throw new IllegalStateException("Cannot load the page file when it is allready loaded.");
357            }
358        }
359    
360    
361        /**
362         * Unloads a previously loaded PageFile.  This deallocates OS related resources like file handles.
363         * once unloaded, you can no longer use the page file to read or write Pages.
364         * 
365         * @throws IOException
366         *         if there was a disk error occurred while closing the down the page file.
367         * @throws IllegalStateException
368         *         if the PageFile is not loaded
369         */
370        public void unload() throws IOException {
371            if (loaded.compareAndSet(true, false)) {
372                flush();
373                try {
374                    stopWriter();
375                } catch (InterruptedException e) {
376                    throw new InterruptedIOException();
377                }
378                
379                if( freeList.isEmpty() ) {
380                    metaData.setFreePages(0);
381                } else {
382                    storeFreeList();
383                    metaData.setFreePages(freeList.size());
384                }
385                
386                metaData.setLastTxId( nextTxid.get()-1 );
387                metaData.setCleanShutdown(true);
388                storeMetaData();
389                
390                if (readFile != null) {
391                    readFile.close();
392                    readFile = null;
393                    writeFile.close();
394                    writeFile=null;
395                    if( enableRecoveryFile ) {
396                        recoveryFile.close();
397                        recoveryFile=null;
398                    }
399                    freeList.clear();
400                    if( pageCache!=null ) {
401                        pageCache=null;
402                    }
403                    synchronized(writes) {
404                        writes.clear();
405                    }
406                }
407            } else {
408                throw new IllegalStateException("Cannot unload the page file when it is not loaded");
409            }
410        }
411            
    /**
     * @return true if the page file is currently loaded (between load() and unload()).
     */
    public boolean isLoaded() {
        return loaded.get();
    }
415    
416        /**
417         * Flush and sync all write buffers to disk.
418         * 
419         * @throws IOException
420         *         If an disk error occurred.
421         */
422        public void flush() throws IOException {
423    
424            if( enabledWriteThread && stopWriter.get() ) {
425                throw new IOException("Page file already stopped: checkpointing is not allowed");
426            }
427            
428            // Setup a latch that gets notified when all buffered writes hits the disk.
429            CountDownLatch checkpointLatch;
430            synchronized( writes ) {
431                if( writes.isEmpty()) {                
432                    return;
433                }
434                if( enabledWriteThread ) {
435                    if( this.checkpointLatch == null ) {
436                        this.checkpointLatch = new CountDownLatch(1);
437                    }
438                    checkpointLatch = this.checkpointLatch;
439                    writes.notify();
440                } else {
441                    writeBatch();
442                    return;
443                }
444            }
445            try {
446                int size = writes.size();
447                long start = System.currentTimeMillis();
448                checkpointLatch.await();        
449                long end = System.currentTimeMillis();
450                if( end-start > 100 ) {
451                    LOG.warn("KahaDB PageFile flush: " + size + " queued writes, latch wait took "+(end-start));
452                }
453            } catch (InterruptedException e) {
454                throw new InterruptedIOException();
455            }
456        }
457    
458        
459        public String toString() {
460            return "Page File: "+getMainPageFile();
461        }
462        
463        ///////////////////////////////////////////////////////////////////
464        // Private Implementation Methods
465        ///////////////////////////////////////////////////////////////////
466        private File getMainPageFile() {
467            return new File(directory, IOHelper.toFileSystemSafeName(name)+PAGEFILE_SUFFIX);
468        }
469        
470        public File getFreeFile() {
471            return new File(directory, IOHelper.toFileSystemSafeName(name)+FREE_FILE_SUFFIX);
472        } 
473    
474        public File getRecoveryFile() {
475            return new File(directory, IOHelper.toFileSystemSafeName(name)+RECOVERY_FILE_SUFFIX);
476        } 
477    
478        private long toOffset(long pageId) {
479            return PAGE_FILE_HEADER_SIZE+(pageId*pageSize);
480        }
481    
482        private void loadMetaData() throws IOException {
483    
484            ByteArrayInputStream is;
485            MetaData v1 = new MetaData();
486            MetaData v2 = new MetaData();
487            try {
488                Properties p = new Properties();
489                byte[] d = new byte[PAGE_FILE_HEADER_SIZE/2];
490                readFile.seek(0);
491                readFile.readFully(d);
492                is = new ByteArrayInputStream(d);
493                p.load(is);
494                IntrospectionSupport.setProperties(v1, p);
495            } catch (IOException e) {
496                v1 = null;
497            }
498            
499            try {
500                Properties p = new Properties();
501                byte[] d = new byte[PAGE_FILE_HEADER_SIZE/2];
502                readFile.seek(PAGE_FILE_HEADER_SIZE/2);
503                readFile.readFully(d);
504                is = new ByteArrayInputStream(d);
505                p.load(is);
506                IntrospectionSupport.setProperties(v2, p);
507            } catch (IOException e) {
508                v2 = null;
509            }
510            
511            if( v1==null && v2==null ) {
512                throw new IOException("Could not load page file meta data");
513            } 
514            
515            if( v1 == null || v1.metaDataTxId<0 ) {
516                metaData = v2;
517            } else if( v2==null || v1.metaDataTxId<0 ) {
518                metaData = v1;
519            } else if( v1.metaDataTxId==v2.metaDataTxId ) {
520                metaData = v1; // use the first since the 2nd could be a partial..
521            } else {
522                metaData = v2; // use the second cause the first is probably a partial.
523            }
524        }
525        
526        private void storeMetaData() throws IOException {
527            // Convert the metadata into a property format
528            metaData.metaDataTxId++;
529            Properties p = new Properties();
530            IntrospectionSupport.getProperties(metaData, p, null);
531            
532            ByteArrayOutputStream os = new ByteArrayOutputStream(PAGE_FILE_HEADER_SIZE);
533            p.store(os, "");
534            if( os.size() > PAGE_FILE_HEADER_SIZE/2) { 
535                throw new IOException("Configuation is to larger than: "+PAGE_FILE_HEADER_SIZE/2);
536            }
537            // Fill the rest with space...
538            byte[] filler = new byte[(PAGE_FILE_HEADER_SIZE/2)-os.size()];
539            Arrays.fill(filler, (byte)' ');
540            os.write(filler);
541            os.flush();
542            
543            byte[] d = os.toByteArray();
544    
545            // So we don't loose it.. write it 2 times...
546            writeFile.seek(0);
547            writeFile.write(d);
548            writeFile.getFD().sync();
549            writeFile.seek(PAGE_FILE_HEADER_SIZE/2);
550            writeFile.write(d);
551            writeFile.getFD().sync();
552        }
553    
554        private void storeFreeList() throws IOException {
555            FileOutputStream os = new FileOutputStream(getFreeFile());
556            DataOutputStream dos = new DataOutputStream(os);
557            SequenceSet.Marshaller.INSTANCE.writePayload(freeList, dos);
558            dos.close();
559        }
560    
561        private void loadFreeList() throws IOException {
562            freeList.clear();
563            FileInputStream is = new FileInputStream(getFreeFile());
564            DataInputStream dis = new DataInputStream(is);
565            freeList = SequenceSet.Marshaller.INSTANCE.readPayload(dis);
566            dis.close();
567        }
568        
569        ///////////////////////////////////////////////////////////////////
570        // Property Accessors 
571        ///////////////////////////////////////////////////////////////////
572        
573        /**
574         * Is the recovery buffer used to double buffer page writes.  Enabled by default.
575         * 
576         * @return is the recovery buffer enabled.
577         */
578        public boolean isEnableRecoveryFile() {
579            return enableRecoveryFile;
580        }
581    
582        /**
583         * Sets if the recovery buffer uses to double buffer page writes.  Enabled by default.  Disabling this
584         * may potentially cause partial page writes which can lead to page file corruption.
585         */
586        public void setEnableRecoveryFile(boolean doubleBuffer) {
587            assertNotLoaded();
588            this.enableRecoveryFile = doubleBuffer;
589        }
590    
591        /**
592         * @return Are page writes synced to disk?
593         */
594        public boolean isEnableDiskSyncs() {
595            return enableDiskSyncs;
596        }
597    
598        /**
599         * Allows you enable syncing writes to disk.
600         * @param syncWrites
601         */
602        public void setEnableDiskSyncs(boolean syncWrites) {
603            assertNotLoaded();
604            this.enableDiskSyncs = syncWrites;
605        }
606        
607        /**
608         * @return the page size
609         */
610        public int getPageSize() {
611            return this.pageSize;
612        }
613    
614        /**
615         * @return the amount of content data that a page can hold.
616         */
617        public int getPageContentSize() {
618            return this.pageSize-Page.PAGE_HEADER_SIZE;
619        }
620        
621        /**
622         * Configures the page size used by the page file.  By default it is 4k.  Once a page file is created on disk,
623         * subsequent loads of that file will use the original pageSize.  Once the PageFile is loaded, this setting
624         * can no longer be changed.
625         * 
626         * @param pageSize the pageSize to set
627         * @throws IllegalStateException
628         *         once the page file is loaded.
629         */
630        public void setPageSize(int pageSize) throws IllegalStateException {
631            assertNotLoaded();
632            this.pageSize = pageSize;
633        }
634        
635        /**
636         * @return true if read page caching is enabled
637         */
638        public boolean isEnablePageCaching() {
639            return this.enablePageCaching;
640        }
641    
642        /**
643         * @param allows you to enable read page caching
644         */
645        public void setEnablePageCaching(boolean enablePageCaching) {
646            assertNotLoaded();
647            this.enablePageCaching = enablePageCaching;
648        }
649    
650        /**
651         * @return the maximum number of pages that will get stored in the read page cache.
652         */
653        public int getPageCacheSize() {
654            return this.pageCacheSize;
655        }
656    
657        /**
658         * @param Sets the maximum number of pages that will get stored in the read page cache.
659         */
660        public void setPageCacheSize(int pageCacheSize) {
661            assertNotLoaded();
662            this.pageCacheSize = pageCacheSize;
663        }
664    
    /** @return true if page writes are performed by a background writer thread. */
    public boolean isEnabledWriteThread() {
        return enabledWriteThread;
    }

    /**
     * Enables or disables the background writer thread.
     * Can only be changed before the page file is loaded.
     */
    public void setEnableWriteThread(boolean enableAsyncWrites) {
        assertNotLoaded();
        this.enabledWriteThread = enableAsyncWrites;
    }

    /** @return the size in bytes of the file header plus all allocated pages. */
    public long getDiskSize() throws IOException {
        return toOffset(nextFreePageId.get());
    }
    
    /**
     * @return the number of pages allocated in the PageFile
     */
    public long getPageCount() {
        return nextFreePageId.get();
    }

    /** @return the minimum number of pages allocated to the recovery file. */
    public int getRecoveryFileMinPageCount() {
        return recoveryFileMinPageCount;
    }

    /**
     * Sets the minimum number of pages allocated to the recovery file.
     * Can only be changed before the page file is loaded.
     */
    public void setRecoveryFileMinPageCount(int recoveryFileMinPageCount) {
        assertNotLoaded();
        this.recoveryFileMinPageCount = recoveryFileMinPageCount;
    }

    /** @return the maximum number of pages the recovery file is allowed to grow to. */
    public int getRecoveryFileMaxPageCount() {
        return recoveryFileMaxPageCount;
    }

    /**
     * Sets the maximum number of pages the recovery file may grow to before
     * being resized back down.  Can only be changed before the page file is loaded.
     */
    public void setRecoveryFileMaxPageCount(int recoveryFileMaxPageCount) {
        assertNotLoaded();
        this.recoveryFileMaxPageCount = recoveryFileMaxPageCount;
    }
702    
703            public int getWriteBatchSize() {
704                    return writeBatchSize;
705            }
706    
707            public void setWriteBatchSize(int writeBatchSize) {
708            assertNotLoaded();
709                    this.writeBatchSize = writeBatchSize;
710            }
711    
712            ///////////////////////////////////////////////////////////////////
713        // Package Protected Methods exposed to Transaction
714        ///////////////////////////////////////////////////////////////////
715    
716        /**
717         * @throws IllegalStateException if the page file is not loaded.
718         */
719        void assertLoaded() throws IllegalStateException {
720            if( !loaded.get() ) {
721                throw new IllegalStateException("PageFile is not loaded");
722            }
723        }
724        void assertNotLoaded() throws IllegalStateException {
725            if( loaded.get() ) {
726                throw new IllegalStateException("PageFile is loaded");
727            }
728        }
729            
730        /** 
731         * Allocates a block of free pages that you can write data to.
732         * 
733         * @param count the number of sequential pages to allocate
734         * @return the first page of the sequential set. 
735         * @throws IOException
736         *         If an disk error occurred.
737         * @throws IllegalStateException
738         *         if the PageFile is not loaded
739         */
740        <T> Page<T> allocate(int count) throws IOException {
741            assertLoaded();
742            if (count <= 0) {
743                throw new IllegalArgumentException("The allocation count must be larger than zero");
744            }
745    
746            Sequence seq = freeList.removeFirstSequence(count);
747    
748            // We may need to create new free pages...
749            if (seq == null) {
750    
751                Page<T> first = null;
752                int c = count;
753                while (c > 0) {
754                    Page<T> page = new Page<T>(nextFreePageId.getAndIncrement());
755                    page.makeFree(getNextWriteTransactionId());
756    
757                    if (first == null) {
758                        first = page;
759                    }
760    
761                    addToCache(page);
762                    DataByteArrayOutputStream out = new DataByteArrayOutputStream(pageSize);
763                    page.write(out);
764                    write(page, out.getData());
765    
766                    // LOG.debug("allocate writing: "+page.getPageId());
767                    c--;
768                }
769    
770                return first;
771            }
772    
773            Page<T> page = new Page<T>(seq.getFirst());
774            page.makeFree(0);
775            // LOG.debug("allocated: "+page.getPageId());
776            return page;
777        }
778    
    /**
     * @return the next write transaction id from the monotonically increasing counter.
     */
    long getNextWriteTransactionId() {
        return nextTxid.incrementAndGet();
    }
782    
    /**
     * Reads the raw bytes of the given page from the main data file.
     *
     * @param pageId the logical id of the page to read
     * @param data the buffer to fill; its length determines how many bytes are read
     * @throws IOException if the page cannot be fully read
     */
    void readPage(long pageId, byte[] data) throws IOException {
        readFile.seek(toOffset(pageId));
        readFile.readFully(data);
    }
787    
788        public void freePage(long pageId) {
789            freeList.add(pageId);
790            if( enablePageCaching ) {
791                pageCache.remove(pageId);
792            }
793        }
794        
795        @SuppressWarnings("unchecked")
796        private <T> void write(Page<T> page, byte[] data) throws IOException {
797            final PageWrite write = new PageWrite(page, data);
798            Entry<Long, PageWrite> entry = new Entry<Long, PageWrite>(){
799                public Long getKey() {
800                    return write.getPage().getPageId();
801                }
802                public PageWrite getValue() {
803                    return write;
804                }
805                public PageWrite setValue(PageWrite value) {
806                    return null;
807                }
808            };
809            Entry<Long, PageWrite>[] entries = new Map.Entry[]{entry};
810            write(Arrays.asList(entries));
811        }
812    
    /**
     * Merges a batch of page updates into the pending-write map and triggers a
     * flush once enough updates have accumulated. With the async writer thread
     * enabled this applies back-pressure; otherwise the flush happens inline on
     * the caller's thread.
     *
     * @param updates page-id to PageWrite pairs to queue
     * @throws IOException if an inline flush fails
     * @throws InterruptedIOException if interrupted while waiting for capacity
     */
    void write(Collection<Map.Entry<Long, PageWrite>> updates) throws IOException {
        synchronized( writes ) {
            if( enabledWriteThread  ) {
                // Back-pressure: block producers while the pending map is at
                // capacity; the writer thread notifyAll()s on 'writes' as it drains.
                while( writes.size() >= writeBatchSize && !stopWriter.get() ) {
                    try {
                        writes.wait();
                    } catch (InterruptedException e) {
                        // Re-assert the interrupt and surface it as an I/O failure.
                        Thread.currentThread().interrupt();
                        throw new InterruptedIOException();
                    }
                }
            }

            // Merge: a page already queued just has its 'current' contents
            // replaced; new pages get a fresh entry.
            for (Map.Entry<Long, PageWrite> entry : updates) {
                Long key = entry.getKey();
                PageWrite value = entry.getValue();
                PageWrite write = writes.get(key);
                if( write==null ) {
                    writes.put(key, value);
                } else {
                    write.setCurrent(value.page, value.current);
                }
            }

            // Once we start approaching capacity, notify the writer to start writing
            if( canStartWriteBatch() ) {
                if( enabledWriteThread  ) {
                    writes.notify();
                } else {
                    // No writer thread: flush synchronously.
                    writeBatch();
                }
            }
        }
    }
847        
848        private boolean canStartWriteBatch() {
849                    int capacityUsed = ((writes.size() * 100)/writeBatchSize);
850            if( enabledWriteThread ) {
851                // The constant 10 here controls how soon write batches start going to disk..
852                // would be nice to figure out how to auto tune that value.  Make to small and
853                // we reduce through put because we are locking the write mutex too often doing writes
854                return capacityUsed >= 10 || checkpointLatch!=null;
855            } else {
856                return capacityUsed >= 80 || checkpointLatch!=null;
857            }
858        }
859    
860        ///////////////////////////////////////////////////////////////////
861        // Cache Related operations
862        ///////////////////////////////////////////////////////////////////
863        @SuppressWarnings("unchecked")
864        <T> Page<T> getFromCache(long pageId) {
865            synchronized(writes) {
866                PageWrite pageWrite = writes.get(pageId);
867                if( pageWrite != null ) {
868                    return pageWrite.page;
869                }
870            }
871    
872            Page<T> result = null;
873            if (enablePageCaching) {
874                result = pageCache.get(pageId);
875            }
876            return result;
877        }
878    
879        void addToCache(Page page) {
880            if (enablePageCaching) {
881                pageCache.put(page.getPageId(), page);
882            }
883        }
884    
885        void removeFromCache(Page page) {
886            if (enablePageCaching) {
887                pageCache.remove(page.getPageId());
888            }
889        }
890    
891        ///////////////////////////////////////////////////////////////////
892        // Internal Double write implementation follows...
893        ///////////////////////////////////////////////////////////////////
894        /**
895         * 
896         */
897        private void pollWrites() {
898            try {
899                while( !stopWriter.get() ) {
900                    // Wait for a notification...
901                    synchronized( writes ) {  
902                        writes.notifyAll();
903                        
904                        // If there is not enough to write, wait for a notification...
905                        while( writes.isEmpty() && checkpointLatch==null && !stopWriter.get() ) {
906                            writes.wait(100);
907                        }
908                        
909                        if( writes.isEmpty() ) {
910                            releaseCheckpointWaiter();
911                        }
912                    }
913                    writeBatch();
914                }
915            } catch (Throwable e) {
916                e.printStackTrace();
917            } finally {
918                releaseCheckpointWaiter();
919            }
920        }
921    
922        /**
923         * 
924         * @param timeout
925         * @param unit
926         * @return true if there are still pending writes to do.
927         * @throws InterruptedException 
928         * @throws IOException 
929         */
930        private void writeBatch() throws IOException {
931                
932            CountDownLatch checkpointLatch;
933            ArrayList<PageWrite> batch;
934            synchronized( writes ) {
935                // If there is not enough to write, wait for a notification...
936    
937                batch = new ArrayList<PageWrite>(writes.size());
938                // build a write batch from the current write cache.
939                for (PageWrite write : writes.values()) {
940                    batch.add(write);
941                    // Move the current write to the diskBound write, this lets folks update the 
942                    // page again without blocking for this write.
943                    write.begin();
944                    if (write.diskBound == null) {
945                        batch.remove(write);
946                    }
947                }
948    
949                // Grab on to the existing checkpoint latch cause once we do this write we can 
950                // release the folks that were waiting for those writes to hit disk.
951                checkpointLatch = this.checkpointLatch;
952                this.checkpointLatch=null;
953            }
954            
955           try {
956                if (enableRecoveryFile) {
957    
958                    // Using Adler-32 instead of CRC-32 because it's much faster and
959                    // it's
960                    // weakness for short messages with few hundred bytes is not a
961                    // factor in this case since we know
962                    // our write batches are going to much larger.
963                    Checksum checksum = new Adler32();
964                    for (PageWrite w : batch) {
965                        try {
966                            checksum.update(w.diskBound, 0, pageSize);
967                        } catch (Throwable t) {
968                            throw IOExceptionSupport.create(
969                                    "Cannot create recovery file. Reason: " + t, t);
970                        }
971                    }
972    
973                    // Can we shrink the recovery buffer??
974                    if (recoveryPageCount > recoveryFileMaxPageCount) {
975                        int t = Math.max(recoveryFileMinPageCount, batch.size());
976                        recoveryFile.setLength(recoveryFileSizeForPages(t));
977                    }
978    
979                    // Record the page writes in the recovery buffer.
980                    recoveryFile.seek(0);
981                    // Store the next tx id...
982                    recoveryFile.writeLong(nextTxid.get());
983                    // Store the checksum for thw write batch so that on recovery we
984                    // know if we have a consistent
985                    // write batch on disk.
986                    recoveryFile.writeLong(checksum.getValue());
987                    // Write the # of pages that will follow
988                    recoveryFile.writeInt(batch.size());
989    
990                    // Write the pages.
991                    recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
992    
993                    for (PageWrite w : batch) {
994                        recoveryFile.writeLong(w.page.getPageId());
995                        recoveryFile.write(w.diskBound, 0, pageSize);
996                    }
997    
998                    if (enableDiskSyncs) {
999                        // Sync to make sure recovery buffer writes land on disk..
1000                        recoveryFile.getFD().sync();
1001                    }
1002    
1003                    recoveryPageCount = batch.size();
1004                }
1005    
1006                for (PageWrite w : batch) {
1007                    writeFile.seek(toOffset(w.page.getPageId()));
1008                    writeFile.write(w.diskBound, 0, pageSize);
1009                    w.done();
1010                }
1011    
1012                // Sync again
1013                if (enableDiskSyncs) {
1014                    writeFile.getFD().sync();
1015                }
1016    
1017            } finally {
1018                synchronized (writes) {
1019                    for (PageWrite w : batch) {
1020                        // If there are no more pending writes, then remove it from
1021                        // the write cache.
1022                        if (w.isDone()) {
1023                            writes.remove(w.page.getPageId());
1024                        }
1025                    }
1026                }
1027                
1028                if( checkpointLatch!=null ) {
1029                    checkpointLatch.countDown();
1030                }
1031            }
1032        }
1033    
1034        private long recoveryFileSizeForPages(int pageCount) {
1035            return RECOVERY_FILE_HEADER_SIZE+((pageSize+8)*pageCount);
1036        }
1037    
1038        private void releaseCheckpointWaiter() {
1039            if( checkpointLatch!=null ) {
1040                checkpointLatch.countDown();
1041                checkpointLatch=null;
1042            }
1043        }       
1044        
1045        /**
1046         * Inspects the recovery buffer and re-applies any 
1047         * partially applied page writes.
1048         * 
1049         * @return the next transaction id that can be used.
1050         * @throws IOException
1051         */
1052        private long redoRecoveryUpdates() throws IOException {
1053            if( !enableRecoveryFile ) {
1054                return 0;
1055            }
1056            recoveryPageCount=0;
1057            
1058            // Are we initializing the recovery file?
1059            if( recoveryFile.length() == 0 ) {
1060                // Write an empty header..
1061                recoveryFile.write(new byte[RECOVERY_FILE_HEADER_SIZE]);
1062                // Preallocate the minium size for better performance.
1063                recoveryFile.setLength(recoveryFileSizeForPages(recoveryFileMinPageCount));
1064                return 0;
1065            }
1066            
1067            // How many recovery pages do we have in the recovery buffer?
1068            recoveryFile.seek(0);
1069            long nextTxId = recoveryFile.readLong();
1070            long expectedChecksum = recoveryFile.readLong();
1071            int pageCounter = recoveryFile.readInt();
1072            
1073            recoveryFile.seek(RECOVERY_FILE_HEADER_SIZE);
1074            Checksum checksum = new Adler32();
1075            LinkedHashMap<Long, byte[]> batch = new LinkedHashMap<Long, byte[]>();
1076            try {
1077                for (int i = 0; i < pageCounter; i++) {
1078                    long offset = recoveryFile.readLong();
1079                    byte []data = new byte[pageSize];
1080                    if( recoveryFile.read(data, 0, pageSize) != pageSize ) {
1081                        // Invalid recovery record, Could not fully read the data". Probably due to a partial write to the recovery buffer
1082                        return nextTxId;
1083                    }
1084                    checksum.update(data, 0, pageSize);
1085                    batch.put(offset, data);
1086                }
1087            } catch (Exception e) {
1088                // If an error occurred it was cause the redo buffer was not full written out correctly.. so don't redo it. 
1089                // as the pages should still be consistent.
1090                LOG.debug("Redo buffer was not fully intact: ", e);
1091                return nextTxId;
1092            }
1093            
1094            recoveryPageCount = pageCounter;
1095            
1096            // If the checksum is not valid then the recovery buffer was partially written to disk.
1097            if( checksum.getValue() != expectedChecksum ) {
1098                return nextTxId;
1099            }
1100            
1101            // Re-apply all the writes in the recovery buffer.
1102            for (Map.Entry<Long, byte[]> e : batch.entrySet()) {
1103                writeFile.seek(toOffset(e.getKey()));
1104                writeFile.write(e.getValue());
1105            }
1106            
1107            // And sync it to disk
1108            writeFile.getFD().sync();
1109            return nextTxId;
1110        }
1111    
1112        private void startWriter() {
1113            synchronized( writes ) {
1114                if( enabledWriteThread ) {
1115                    stopWriter.set(false);
1116                    writerThread = new Thread("KahaDB Page Writer") {
1117                        @Override
1118                        public void run() {
1119                            pollWrites();
1120                        }
1121                    };
1122                    writerThread.setPriority(Thread.MAX_PRIORITY);
1123                    writerThread.setDaemon(true);
1124                    writerThread.start();
1125                }
1126            }
1127        }
1128     
1129        private void stopWriter() throws InterruptedException {
1130            if( enabledWriteThread ) {
1131                stopWriter.set(true);
1132                writerThread.join();
1133            }
1134        }
1135    
1136            public File getFile() {
1137                    return getMainPageFile();
1138            }
1139    
1140    }