001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.ha;
020
021 import java.io.IOException;
022 import java.util.Arrays;
023 import java.util.List;
024 import java.util.concurrent.CountDownLatch;
025 import java.util.concurrent.TimeUnit;
026 import java.util.concurrent.locks.Lock;
027 import java.util.concurrent.locks.ReentrantLock;
028
029 import org.apache.commons.logging.Log;
030 import org.apache.commons.logging.LogFactory;
031 import org.apache.hadoop.HadoopIllegalArgumentException;
032 import org.apache.hadoop.classification.InterfaceAudience;
033 import org.apache.hadoop.classification.InterfaceStability;
034 import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
035 import org.apache.hadoop.util.StringUtils;
036 import org.apache.zookeeper.data.ACL;
037 import org.apache.zookeeper.KeeperException;
038 import org.apache.zookeeper.Watcher;
039 import org.apache.zookeeper.WatchedEvent;
040 import org.apache.zookeeper.Watcher.Event;
041 import org.apache.zookeeper.ZKUtil;
042 import org.apache.zookeeper.ZooKeeper;
043 import org.apache.zookeeper.CreateMode;
044 import org.apache.zookeeper.AsyncCallback.*;
045 import org.apache.zookeeper.data.Stat;
046 import org.apache.zookeeper.KeeperException.Code;
047
048 import com.google.common.annotations.VisibleForTesting;
049 import com.google.common.base.Preconditions;
050
051 /**
052 *
053 * This class implements a simple library to perform leader election on top of
054 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
055 * can be performed by atomically creating an ephemeral lock file (znode) on
056 * Zookeeper. The service instance that successfully creates the znode becomes
057 * active and the rest become standbys. <br/>
058 * This election mechanism is only efficient for small number of election
059 * candidates (order of 10's) because contention on single znode by a large
060 * number of candidates can result in Zookeeper overload. <br/>
061 * The elector does not guarantee fencing (protection of shared resources) among
062 * service instances. After it has notified an instance about becoming a leader,
063 * then that instance must ensure that it meets the service consistency
064 * requirements. If it cannot do so, then it is recommended to quit the
065 * election. The application implements the {@link ActiveStandbyElectorCallback}
066 * to interact with the elector
067 */
068 @InterfaceAudience.Private
069 @InterfaceStability.Evolving
070 public class ActiveStandbyElector implements StatCallback, StringCallback {
071
072 /**
073 * Callback interface to interact with the ActiveStandbyElector object. <br/>
074 * The application will be notified with a callback only on state changes
075 * (i.e. there will never be successive calls to becomeActive without an
076 * intermediate call to enterNeutralMode). <br/>
077 * The callbacks will be running on Zookeeper client library threads. The
078 * application should return from these callbacks quickly so as not to impede
079 * Zookeeper client library performance and notifications. The app will
080 * typically remember the state change and return from the callback. It will
081 * then proceed with implementing actions around that state change. It is
082 * possible to be called back again while these actions are in flight and the
083 * app should handle this scenario.
084 */
085 public interface ActiveStandbyElectorCallback {
086 /**
087 * This method is called when the app becomes the active leader.
088 * If the service fails to become active, it should throw
089 * ServiceFailedException. This will cause the elector to
090 * sleep for a short period, then re-join the election.
091 *
092 * Callback implementations are expected to manage their own
093 * timeouts (e.g. when making an RPC to a remote node).
094 */
095 void becomeActive() throws ServiceFailedException;
096
097 /**
098 * This method is called when the app becomes a standby
099 */
100 void becomeStandby();
101
102 /**
103 * If the elector gets disconnected from Zookeeper and does not know about
104 * the lock state, then it will notify the service via the enterNeutralMode
105 * interface. The service may choose to ignore this or stop doing state
106 * changing operations. Upon reconnection, the elector verifies the leader
107 * status and calls back on the becomeActive and becomeStandby app
108 * interfaces. <br/>
109 * Zookeeper disconnects can happen due to network issues or loss of
110 * Zookeeper quorum. Thus enterNeutralMode can be used to guard against
111 * split-brain issues. In such situations it might be prudent to call
112 * becomeStandby too. However, such state change operations might be
113 * expensive and enterNeutralMode can help guard against doing that for
114 * transient issues.
115 */
116 void enterNeutralMode();
117
118 /**
119 * If there is any fatal error (e.g. wrong ACL's, unexpected Zookeeper
120 * errors or Zookeeper persistent unavailability) then notifyFatalError is
121 * called to notify the app about it.
122 */
123 void notifyFatalError(String errorMessage);
124
125 /**
126 * If an old active has failed, rather than exited gracefully, then
127 * the new active may need to take some fencing actions against it
128 * before proceeding with failover.
129 *
130 * @param oldActiveData the application data provided by the prior active
131 */
132 void fenceOldActive(byte[] oldActiveData);
133 }
134
135 /**
136 * Name of the lock znode used by the library. Protected for access in test
137 * classes
138 */
139 @VisibleForTesting
140 protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
141 @VisibleForTesting
142 protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";
143
144 public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);
145
146 static int NUM_RETRIES = 3;
147 private static final int SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE = 1000;
148
149 private static enum ConnectionState {
150 DISCONNECTED, CONNECTED, TERMINATED
151 };
152
153 static enum State {
154 INIT, ACTIVE, STANDBY, NEUTRAL
155 };
156
157 private State state = State.INIT;
158 private int createRetryCount = 0;
159 private int statRetryCount = 0;
160 private ZooKeeper zkClient;
161 private WatcherWithClientRef watcher;
162 private ConnectionState zkConnectionState = ConnectionState.TERMINATED;
163
164 private final ActiveStandbyElectorCallback appClient;
165 private final String zkHostPort;
166 private final int zkSessionTimeout;
167 private final List<ACL> zkAcl;
168 private final List<ZKAuthInfo> zkAuthInfo;
169 private byte[] appData;
170 private final String zkLockFilePath;
171 private final String zkBreadCrumbPath;
172 private final String znodeWorkingDir;
173
174 private Lock sessionReestablishLockForTests = new ReentrantLock();
175 private boolean wantToBeInElection;
176
177 /**
178 * Create a new ActiveStandbyElector object <br/>
179 * The elector is created by providing to it the Zookeeper configuration, the
180 * parent znode under which to create the znode and a reference to the
181 * callback interface. <br/>
182 * The parent znode name must be the same for all service instances and
183 * different across services. <br/>
184 * After the leader has been lost, a new leader will be elected after the
185 * session timeout expires. Hence, the app must set this parameter based on
186 * its needs for failure response time. The session timeout must be greater
187 * than the Zookeeper disconnect timeout and is recommended to be 3X that
188 * value to enable Zookeeper to retry transient disconnections. Setting a very
189 * short session timeout may result in frequent transitions between active and
190 * standby states during issues like network outages/GS pauses.
191 *
192 * @param zookeeperHostPorts
193 * ZooKeeper hostPort for all ZooKeeper servers
194 * @param zookeeperSessionTimeout
195 * ZooKeeper session timeout
196 * @param parentZnodeName
197 * znode under which to create the lock
198 * @param acl
199 * ZooKeeper ACL's
200 * @param authInfo a list of authentication credentials to add to the
201 * ZK connection
202 * @param app
203 * reference to callback interface object
204 * @throws IOException
205 * @throws HadoopIllegalArgumentException
206 */
207 public ActiveStandbyElector(String zookeeperHostPorts,
208 int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
209 List<ZKAuthInfo> authInfo,
210 ActiveStandbyElectorCallback app) throws IOException,
211 HadoopIllegalArgumentException, KeeperException {
212 if (app == null || acl == null || parentZnodeName == null
213 || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
214 throw new HadoopIllegalArgumentException("Invalid argument");
215 }
216 zkHostPort = zookeeperHostPorts;
217 zkSessionTimeout = zookeeperSessionTimeout;
218 zkAcl = acl;
219 zkAuthInfo = authInfo;
220 appClient = app;
221 znodeWorkingDir = parentZnodeName;
222 zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
223 zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
224
225 // createConnection for future API calls
226 createConnection();
227 }
228
229 /**
230 * To participate in election, the app will call joinElection. The result will
231 * be notified by a callback on either the becomeActive or becomeStandby app
232 * interfaces. <br/>
233 * After this the elector will automatically monitor the leader status and
234 * perform re-election if necessary<br/>
235 * The app could potentially start off in standby mode and ignore the
236 * becomeStandby call.
237 *
238 * @param data
239 * to be set by the app. non-null data must be set.
240 * @throws HadoopIllegalArgumentException
241 * if valid data is not supplied
242 */
243 public synchronized void joinElection(byte[] data)
244 throws HadoopIllegalArgumentException {
245
246 if (data == null) {
247 throw new HadoopIllegalArgumentException("data cannot be null");
248 }
249
250 if (wantToBeInElection) {
251 LOG.info("Already in election. Not re-connecting.");
252 return;
253 }
254
255 appData = new byte[data.length];
256 System.arraycopy(data, 0, appData, 0, data.length);
257
258 LOG.debug("Attempting active election for " + this);
259 joinElectionInternal();
260 }
261
262 /**
263 * @return true if the configured parent znode exists
264 */
265 public synchronized boolean parentZNodeExists()
266 throws IOException, InterruptedException {
267 Preconditions.checkState(zkClient != null);
268 try {
269 return zkClient.exists(znodeWorkingDir, false) != null;
270 } catch (KeeperException e) {
271 throw new IOException("Couldn't determine existence of znode '" +
272 znodeWorkingDir + "'", e);
273 }
274 }
275
276 /**
277 * Utility function to ensure that the configured base znode exists.
278 * This recursively creates the znode as well as all of its parents.
279 */
280 public synchronized void ensureParentZNode()
281 throws IOException, InterruptedException {
282 Preconditions.checkState(!wantToBeInElection,
283 "ensureParentZNode() may not be called while in the election");
284
285 String pathParts[] = znodeWorkingDir.split("/");
286 Preconditions.checkArgument(pathParts.length >= 1 &&
287 pathParts[0].isEmpty(),
288 "Invalid path: %s", znodeWorkingDir);
289
290 StringBuilder sb = new StringBuilder();
291 for (int i = 1; i < pathParts.length; i++) {
292 sb.append("/").append(pathParts[i]);
293 String prefixPath = sb.toString();
294 LOG.debug("Ensuring existence of " + prefixPath);
295 try {
296 createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT);
297 } catch (KeeperException e) {
298 if (isNodeExists(e.code())) {
299 // This is OK - just ensuring existence.
300 continue;
301 } else {
302 throw new IOException("Couldn't create " + prefixPath, e);
303 }
304 }
305 }
306
307 LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
308 }
309
310 /**
311 * Clear all of the state held within the parent ZNode.
312 * This recursively deletes everything within the znode as well as the
313 * parent znode itself. It should only be used when it's certain that
314 * no electors are currently participating in the election.
315 */
316 public synchronized void clearParentZNode()
317 throws IOException, InterruptedException {
318 Preconditions.checkState(!wantToBeInElection,
319 "clearParentZNode() may not be called while in the election");
320
321 try {
322 LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
323
324 zkDoWithRetries(new ZKAction<Void>() {
325 @Override
326 public Void run() throws KeeperException, InterruptedException {
327 ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
328 return null;
329 }
330 });
331 } catch (KeeperException e) {
332 throw new IOException("Couldn't clear parent znode " + znodeWorkingDir,
333 e);
334 }
335 LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
336 }
337
338
339 /**
340 * Any service instance can drop out of the election by calling quitElection.
341 * <br/>
342 * This will lose any leader status, if held, and stop monitoring of the lock
343 * node. <br/>
344 * If the instance wants to participate in election again, then it needs to
345 * call joinElection(). <br/>
346 * This allows service instances to take themselves out of rotation for known
347 * impending unavailable states (e.g. long GC pause or software upgrade).
348 *
349 * @param needFence true if the underlying daemon may need to be fenced
350 * if a failover occurs due to dropping out of the election.
351 */
352 public synchronized void quitElection(boolean needFence) {
353 LOG.info("Yielding from election");
354 if (!needFence && state == State.ACTIVE) {
355 // If active is gracefully going back to standby mode, remove
356 // our permanent znode so no one fences us.
357 tryDeleteOwnBreadCrumbNode();
358 }
359 reset();
360 wantToBeInElection = false;
361 }
362
363 /**
364 * Exception thrown when there is no active leader
365 */
366 public static class ActiveNotFoundException extends Exception {
367 private static final long serialVersionUID = 3505396722342846462L;
368 }
369
370 /**
371 * get data set by the active leader
372 *
373 * @return data set by the active instance
374 * @throws ActiveNotFoundException
375 * when there is no active leader
376 * @throws KeeperException
377 * other zookeeper operation errors
378 * @throws InterruptedException
379 * @throws IOException
380 * when ZooKeeper connection could not be established
381 */
382 public synchronized byte[] getActiveData() throws ActiveNotFoundException,
383 KeeperException, InterruptedException, IOException {
384 try {
385 if (zkClient == null) {
386 createConnection();
387 }
388 Stat stat = new Stat();
389 return getDataWithRetries(zkLockFilePath, false, stat);
390 } catch(KeeperException e) {
391 Code code = e.code();
392 if (isNodeDoesNotExist(code)) {
393 // handle the commonly expected cases that make sense for us
394 throw new ActiveNotFoundException();
395 } else {
396 throw e;
397 }
398 }
399 }
400
401 /**
402 * interface implementation of Zookeeper callback for create
403 */
404 @Override
405 public synchronized void processResult(int rc, String path, Object ctx,
406 String name) {
407 if (isStaleClient(ctx)) return;
408 LOG.debug("CreateNode result: " + rc + " for path: " + path
409 + " connectionState: " + zkConnectionState +
410 " for " + this);
411
412 Code code = Code.get(rc);
413 if (isSuccess(code)) {
414 // we successfully created the znode. we are the leader. start monitoring
415 if (becomeActive()) {
416 monitorActiveStatus();
417 } else {
418 reJoinElectionAfterFailureToBecomeActive();
419 }
420 return;
421 }
422
423 if (isNodeExists(code)) {
424 if (createRetryCount == 0) {
425 // znode exists and we did not retry the operation. so a different
426 // instance has created it. become standby and monitor lock.
427 becomeStandby();
428 }
429 // if we had retried then the znode could have been created by our first
430 // attempt to the server (that we lost) and this node exists response is
431 // for the second attempt. verify this case via ephemeral node owner. this
432 // will happen on the callback for monitoring the lock.
433 monitorActiveStatus();
434 return;
435 }
436
437 String errorMessage = "Received create error from Zookeeper. code:"
438 + code.toString() + " for path " + path;
439 LOG.debug(errorMessage);
440
441 if (shouldRetry(code)) {
442 if (createRetryCount < NUM_RETRIES) {
443 LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
444 ++createRetryCount;
445 createLockNodeAsync();
446 return;
447 }
448 errorMessage = errorMessage
449 + ". Not retrying further znode create connection errors.";
450 } else if (isSessionExpired(code)) {
451 // This isn't fatal - the client Watcher will re-join the election
452 LOG.warn("Lock acquisition failed because session was lost");
453 return;
454 }
455
456 fatalError(errorMessage);
457 }
458
459 /**
460 * interface implementation of Zookeeper callback for monitor (exists)
461 */
462 @Override
463 public synchronized void processResult(int rc, String path, Object ctx,
464 Stat stat) {
465 if (isStaleClient(ctx)) return;
466
467 assert wantToBeInElection :
468 "Got a StatNode result after quitting election";
469
470 LOG.debug("StatNode result: " + rc + " for path: " + path
471 + " connectionState: " + zkConnectionState + " for " + this);
472
473
474 Code code = Code.get(rc);
475 if (isSuccess(code)) {
476 // the following owner check completes verification in case the lock znode
477 // creation was retried
478 if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
479 // we own the lock znode. so we are the leader
480 if (!becomeActive()) {
481 reJoinElectionAfterFailureToBecomeActive();
482 }
483 } else {
484 // we dont own the lock znode. so we are a standby.
485 becomeStandby();
486 }
487 // the watch set by us will notify about changes
488 return;
489 }
490
491 if (isNodeDoesNotExist(code)) {
492 // the lock znode disappeared before we started monitoring it
493 enterNeutralMode();
494 joinElectionInternal();
495 return;
496 }
497
498 String errorMessage = "Received stat error from Zookeeper. code:"
499 + code.toString();
500 LOG.debug(errorMessage);
501
502 if (shouldRetry(code)) {
503 if (statRetryCount < NUM_RETRIES) {
504 ++statRetryCount;
505 monitorLockNodeAsync();
506 return;
507 }
508 errorMessage = errorMessage
509 + ". Not retrying further znode monitoring connection errors.";
510 } else if (isSessionExpired(code)) {
511 // This isn't fatal - the client Watcher will re-join the election
512 LOG.warn("Lock monitoring failed because session was lost");
513 return;
514 }
515
516 fatalError(errorMessage);
517 }
518
519 /**
520 * We failed to become active. Re-join the election, but
521 * sleep for a few seconds after terminating our existing
522 * session, so that other nodes have a chance to become active.
523 * The failure to become active is already logged inside
524 * becomeActive().
525 */
526 private void reJoinElectionAfterFailureToBecomeActive() {
527 reJoinElection(SLEEP_AFTER_FAILURE_TO_BECOME_ACTIVE);
528 }
529
530 /**
531 * interface implementation of Zookeeper watch events (connection and node),
532 * proxied by {@link WatcherWithClientRef}.
533 */
534 synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
535 Event.EventType eventType = event.getType();
536 if (isStaleClient(zk)) return;
537 LOG.debug("Watcher event type: " + eventType + " with state:"
538 + event.getState() + " for path:" + event.getPath()
539 + " connectionState: " + zkConnectionState
540 + " for " + this);
541
542 if (eventType == Event.EventType.None) {
543 // the connection state has changed
544 switch (event.getState()) {
545 case SyncConnected:
546 LOG.info("Session connected.");
547 // if the listener was asked to move to safe state then it needs to
548 // be undone
549 ConnectionState prevConnectionState = zkConnectionState;
550 zkConnectionState = ConnectionState.CONNECTED;
551 if (prevConnectionState == ConnectionState.DISCONNECTED &&
552 wantToBeInElection) {
553 monitorActiveStatus();
554 }
555 break;
556 case Disconnected:
557 LOG.info("Session disconnected. Entering neutral mode...");
558
559 // ask the app to move to safe state because zookeeper connection
560 // is not active and we dont know our state
561 zkConnectionState = ConnectionState.DISCONNECTED;
562 enterNeutralMode();
563 break;
564 case Expired:
565 // the connection got terminated because of session timeout
566 // call listener to reconnect
567 LOG.info("Session expired. Entering neutral mode and rejoining...");
568 enterNeutralMode();
569 reJoinElection(0);
570 break;
571 case SaslAuthenticated:
572 LOG.info("Successfully authenticated to ZooKeeper using SASL.");
573 break;
574 default:
575 fatalError("Unexpected Zookeeper watch event state: "
576 + event.getState());
577 break;
578 }
579
580 return;
581 }
582
583 // a watch on lock path in zookeeper has fired. so something has changed on
584 // the lock. ideally we should check that the path is the same as the lock
585 // path but trusting zookeeper for now
586 String path = event.getPath();
587 if (path != null) {
588 switch (eventType) {
589 case NodeDeleted:
590 if (state == State.ACTIVE) {
591 enterNeutralMode();
592 }
593 joinElectionInternal();
594 break;
595 case NodeDataChanged:
596 monitorActiveStatus();
597 break;
598 default:
599 LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
600 monitorActiveStatus();
601 }
602
603 return;
604 }
605
606 // some unexpected error has occurred
607 fatalError("Unexpected watch error from Zookeeper");
608 }
609
610 /**
611 * Get a new zookeeper client instance. protected so that test class can
612 * inherit and pass in a mock object for zookeeper
613 *
614 * @return new zookeeper client instance
615 * @throws IOException
616 * @throws KeeperException zookeeper connectionloss exception
617 */
618 protected synchronized ZooKeeper getNewZooKeeper() throws IOException,
619 KeeperException {
620
621 // Unfortunately, the ZooKeeper constructor connects to ZooKeeper and
622 // may trigger the Connected event immediately. So, if we register the
623 // watcher after constructing ZooKeeper, we may miss that event. Instead,
624 // we construct the watcher first, and have it block any events it receives
625 // before we can set its ZooKeeper reference.
626 watcher = new WatcherWithClientRef();
627 ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
628 watcher.setZooKeeperRef(zk);
629
630 // Wait for the asynchronous success/failure. This may throw an exception
631 // if we don't connect within the session timeout.
632 watcher.waitForZKConnectionEvent(zkSessionTimeout);
633
634 for (ZKAuthInfo auth : zkAuthInfo) {
635 zk.addAuthInfo(auth.getScheme(), auth.getAuth());
636 }
637 return zk;
638 }
639
640 private void fatalError(String errorMessage) {
641 LOG.fatal(errorMessage);
642 reset();
643 appClient.notifyFatalError(errorMessage);
644 }
645
646 private void monitorActiveStatus() {
647 assert wantToBeInElection;
648 LOG.debug("Monitoring active leader for " + this);
649 statRetryCount = 0;
650 monitorLockNodeAsync();
651 }
652
653 private void joinElectionInternal() {
654 Preconditions.checkState(appData != null,
655 "trying to join election without any app data");
656 if (zkClient == null) {
657 if (!reEstablishSession()) {
658 fatalError("Failed to reEstablish connection with ZooKeeper");
659 return;
660 }
661 }
662
663 createRetryCount = 0;
664 wantToBeInElection = true;
665 createLockNodeAsync();
666 }
667
668 private void reJoinElection(int sleepTime) {
669 LOG.info("Trying to re-establish ZK session");
670
671 // Some of the test cases rely on expiring the ZK sessions and
672 // ensuring that the other node takes over. But, there's a race
673 // where the original lease holder could reconnect faster than the other
674 // thread manages to take the lock itself. This lock allows the
675 // tests to block the reconnection. It's a shame that this leaked
676 // into non-test code, but the lock is only acquired here so will never
677 // be contended.
678 sessionReestablishLockForTests.lock();
679 try {
680 terminateConnection();
681 sleepFor(sleepTime);
682 // Should not join election even before the SERVICE is reported
683 // as HEALTHY from ZKFC monitoring.
684 if (appData != null) {
685 joinElectionInternal();
686 } else {
687 LOG.info("Not joining election since service has not yet been " +
688 "reported as healthy.");
689 }
690 } finally {
691 sessionReestablishLockForTests.unlock();
692 }
693 }
694
695 /**
696 * Sleep for the given number of milliseconds.
697 * This is non-static, and separated out, so that unit tests
698 * can override the behavior not to sleep.
699 */
700 @VisibleForTesting
701 protected void sleepFor(int sleepMs) {
702 if (sleepMs > 0) {
703 try {
704 Thread.sleep(sleepMs);
705 } catch (InterruptedException e) {
706 Thread.currentThread().interrupt();
707 }
708 }
709 }
710
711 @VisibleForTesting
712 void preventSessionReestablishmentForTests() {
713 sessionReestablishLockForTests.lock();
714 }
715
716 @VisibleForTesting
717 void allowSessionReestablishmentForTests() {
718 sessionReestablishLockForTests.unlock();
719 }
720
721 @VisibleForTesting
722 synchronized long getZKSessionIdForTests() {
723 if (zkClient != null) {
724 return zkClient.getSessionId();
725 } else {
726 return -1;
727 }
728 }
729
730 @VisibleForTesting
731 synchronized State getStateForTests() {
732 return state;
733 }
734
735 private boolean reEstablishSession() {
736 int connectionRetryCount = 0;
737 boolean success = false;
738 while(!success && connectionRetryCount < NUM_RETRIES) {
739 LOG.debug("Establishing zookeeper connection for " + this);
740 try {
741 createConnection();
742 success = true;
743 } catch(IOException e) {
744 LOG.warn(e);
745 sleepFor(5000);
746 } catch(KeeperException e) {
747 LOG.warn(e);
748 sleepFor(5000);
749 }
750 ++connectionRetryCount;
751 }
752 return success;
753 }
754
755 private void createConnection() throws IOException, KeeperException {
756 if (zkClient != null) {
757 try {
758 zkClient.close();
759 } catch (InterruptedException e) {
760 throw new IOException("Interrupted while closing ZK",
761 e);
762 }
763 zkClient = null;
764 watcher = null;
765 }
766 zkClient = getNewZooKeeper();
767 LOG.debug("Created new connection for " + this);
768 }
769
770 @InterfaceAudience.Private
771 public synchronized void terminateConnection() {
772 if (zkClient == null) {
773 return;
774 }
775 LOG.debug("Terminating ZK connection for " + this);
776 ZooKeeper tempZk = zkClient;
777 zkClient = null;
778 watcher = null;
779 try {
780 tempZk.close();
781 } catch(InterruptedException e) {
782 LOG.warn(e);
783 }
784 zkConnectionState = ConnectionState.TERMINATED;
785 wantToBeInElection = false;
786 }
787
788 private void reset() {
789 state = State.INIT;
790 terminateConnection();
791 }
792
793 private boolean becomeActive() {
794 assert wantToBeInElection;
795 if (state == State.ACTIVE) {
796 // already active
797 return true;
798 }
799 try {
800 Stat oldBreadcrumbStat = fenceOldActive();
801 writeBreadCrumbNode(oldBreadcrumbStat);
802
803 LOG.debug("Becoming active for " + this);
804 appClient.becomeActive();
805 state = State.ACTIVE;
806 return true;
807 } catch (Exception e) {
808 LOG.warn("Exception handling the winning of election", e);
809 // Caller will handle quitting and rejoining the election.
810 return false;
811 }
812 }
813
814 /**
815 * Write the "ActiveBreadCrumb" node, indicating that this node may need
816 * to be fenced on failover.
817 * @param oldBreadcrumbStat
818 */
819 private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
820 throws KeeperException, InterruptedException {
821 Preconditions.checkState(appData != null, "no appdata");
822
823 LOG.info("Writing znode " + zkBreadCrumbPath +
824 " to indicate that the local node is the most recent active...");
825 if (oldBreadcrumbStat == null) {
826 // No previous active, just create the node
827 createWithRetries(zkBreadCrumbPath, appData, zkAcl,
828 CreateMode.PERSISTENT);
829 } else {
830 // There was a previous active, update the node
831 setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
832 }
833 }
834
835 /**
836 * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
837 * active status.
838 * If this fails, it will simply warn, since the graceful release behavior
839 * is only an optimization.
840 */
841 private void tryDeleteOwnBreadCrumbNode() {
842 assert state == State.ACTIVE;
843 LOG.info("Deleting bread-crumb of active node...");
844
845 // Sanity check the data. This shouldn't be strictly necessary,
846 // but better to play it safe.
847 Stat stat = new Stat();
848 byte[] data = null;
849 try {
850 data = zkClient.getData(zkBreadCrumbPath, false, stat);
851
852 if (!Arrays.equals(data, appData)) {
853 throw new IllegalStateException(
854 "We thought we were active, but in fact " +
855 "the active znode had the wrong data: " +
856 StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
857 }
858
859 deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
860 } catch (Exception e) {
861 LOG.warn("Unable to delete our own bread-crumb of being active at " +
862 zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " +
863 "Expecting to be fenced by the next active.");
864 }
865 }
866
867 /**
868 * If there is a breadcrumb node indicating that another node may need
869 * fencing, try to fence that node.
870 * @return the Stat of the breadcrumb node that was read, or null
871 * if no breadcrumb node existed
872 */
873 private Stat fenceOldActive() throws InterruptedException, KeeperException {
874 final Stat stat = new Stat();
875 byte[] data;
876 LOG.info("Checking for any old active which needs to be fenced...");
877 try {
878 data = zkDoWithRetries(new ZKAction<byte[]>() {
879 @Override
880 public byte[] run() throws KeeperException, InterruptedException {
881 return zkClient.getData(zkBreadCrumbPath, false, stat);
882 }
883 });
884 } catch (KeeperException ke) {
885 if (isNodeDoesNotExist(ke.code())) {
886 LOG.info("No old node to fence");
887 return null;
888 }
889
890 // If we failed to read for any other reason, then likely we lost
891 // our session, or we don't have permissions, etc. In any case,
892 // we probably shouldn't become active, and failing the whole
893 // thing is the best bet.
894 throw ke;
895 }
896
897 LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
898 if (Arrays.equals(data, appData)) {
899 LOG.info("But old node has our own data, so don't need to fence it.");
900 } else {
901 appClient.fenceOldActive(data);
902 }
903 return stat;
904 }
905
906 private void becomeStandby() {
907 if (state != State.STANDBY) {
908 LOG.debug("Becoming standby for " + this);
909 state = State.STANDBY;
910 appClient.becomeStandby();
911 }
912 }
913
914 private void enterNeutralMode() {
915 if (state != State.NEUTRAL) {
916 LOG.debug("Entering neutral mode for " + this);
917 state = State.NEUTRAL;
918 appClient.enterNeutralMode();
919 }
920 }
921
922 private void createLockNodeAsync() {
923 zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
924 this, zkClient);
925 }
926
927 private void monitorLockNodeAsync() {
928 zkClient.exists(zkLockFilePath,
929 watcher, this,
930 zkClient);
931 }
932
933 private String createWithRetries(final String path, final byte[] data,
934 final List<ACL> acl, final CreateMode mode)
935 throws InterruptedException, KeeperException {
936 return zkDoWithRetries(new ZKAction<String>() {
937 @Override
938 public String run() throws KeeperException, InterruptedException {
939 return zkClient.create(path, data, acl, mode);
940 }
941 });
942 }
943
944 private byte[] getDataWithRetries(final String path, final boolean watch,
945 final Stat stat) throws InterruptedException, KeeperException {
946 return zkDoWithRetries(new ZKAction<byte[]>() {
947 @Override
948 public byte[] run() throws KeeperException, InterruptedException {
949 return zkClient.getData(path, watch, stat);
950 }
951 });
952 }
953
954 private Stat setDataWithRetries(final String path, final byte[] data,
955 final int version) throws InterruptedException, KeeperException {
956 return zkDoWithRetries(new ZKAction<Stat>() {
957 @Override
958 public Stat run() throws KeeperException, InterruptedException {
959 return zkClient.setData(path, data, version);
960 }
961 });
962 }
963
964 private void deleteWithRetries(final String path, final int version)
965 throws KeeperException, InterruptedException {
966 zkDoWithRetries(new ZKAction<Void>() {
967 @Override
968 public Void run() throws KeeperException, InterruptedException {
969 zkClient.delete(path, version);
970 return null;
971 }
972 });
973 }
974
975 private static <T> T zkDoWithRetries(ZKAction<T> action)
976 throws KeeperException, InterruptedException {
977 int retry = 0;
978 while (true) {
979 try {
980 return action.run();
981 } catch (KeeperException ke) {
982 if (shouldRetry(ke.code()) && ++retry < NUM_RETRIES) {
983 continue;
984 }
985 throw ke;
986 }
987 }
988 }
989
990 private interface ZKAction<T> {
991 T run() throws KeeperException, InterruptedException;
992 }
993
994 /**
995 * The callbacks and watchers pass a reference to the ZK client
996 * which made the original call. We don't want to take action
997 * based on any callbacks from prior clients after we quit
998 * the election.
999 * @param ctx the ZK client passed into the watcher
1000 * @return true if it matches the current client
1001 */
1002 private synchronized boolean isStaleClient(Object ctx) {
1003 Preconditions.checkNotNull(ctx);
1004 if (zkClient != (ZooKeeper)ctx) {
1005 LOG.warn("Ignoring stale result from old client with sessionId " +
1006 String.format("0x%08x", ((ZooKeeper)ctx).getSessionId()));
1007 return true;
1008 }
1009 return false;
1010 }
1011
1012 /**
1013 * Watcher implementation which keeps a reference around to the
1014 * original ZK connection, and passes it back along with any
1015 * events.
1016 */
1017 private final class WatcherWithClientRef implements Watcher {
1018 private ZooKeeper zk;
1019
1020 /**
1021 * Latch fired whenever any event arrives. This is used in order
1022 * to wait for the Connected event when the client is first created.
1023 */
1024 private CountDownLatch hasReceivedEvent = new CountDownLatch(1);
1025
1026 /**
1027 * Latch used to wait until the reference to ZooKeeper is set.
1028 */
1029 private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);
1030
1031 /**
1032 * Waits for the next event from ZooKeeper to arrive.
1033 *
1034 * @param connectionTimeoutMs zookeeper connection timeout in milliseconds
1035 * @throws KeeperException if the connection attempt times out. This will
1036 * be a ZooKeeper ConnectionLoss exception code.
1037 * @throws IOException if interrupted while connecting to ZooKeeper
1038 */
1039 private void waitForZKConnectionEvent(int connectionTimeoutMs)
1040 throws KeeperException, IOException {
1041 try {
1042 if (!hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS)) {
1043 LOG.error("Connection timed out: couldn't connect to ZooKeeper in "
1044 + connectionTimeoutMs + " milliseconds");
1045 zk.close();
1046 throw KeeperException.create(Code.CONNECTIONLOSS);
1047 }
1048 } catch (InterruptedException e) {
1049 Thread.currentThread().interrupt();
1050 throw new IOException(
1051 "Interrupted when connecting to zookeeper server", e);
1052 }
1053 }
1054
1055 private void setZooKeeperRef(ZooKeeper zk) {
1056 Preconditions.checkState(this.zk == null,
1057 "zk already set -- must be set exactly once");
1058 this.zk = zk;
1059 hasSetZooKeeper.countDown();
1060 }
1061
1062 @Override
1063 public void process(WatchedEvent event) {
1064 hasReceivedEvent.countDown();
1065 try {
1066 hasSetZooKeeper.await(zkSessionTimeout, TimeUnit.MILLISECONDS);
1067 ActiveStandbyElector.this.processWatchEvent(
1068 zk, event);
1069 } catch (Throwable t) {
1070 fatalError(
1071 "Failed to process watcher event " + event + ": " +
1072 StringUtils.stringifyException(t));
1073 }
1074 }
1075 }
1076
1077 private static boolean isSuccess(Code code) {
1078 return (code == Code.OK);
1079 }
1080
1081 private static boolean isNodeExists(Code code) {
1082 return (code == Code.NODEEXISTS);
1083 }
1084
1085 private static boolean isNodeDoesNotExist(Code code) {
1086 return (code == Code.NONODE);
1087 }
1088
1089 private static boolean isSessionExpired(Code code) {
1090 return (code == Code.SESSIONEXPIRED);
1091 }
1092
1093 private static boolean shouldRetry(Code code) {
1094 switch (code) {
1095 case CONNECTIONLOSS:
1096 case OPERATIONTIMEOUT:
1097 return true;
1098 }
1099 return false;
1100 }
1101
1102 @Override
1103 public String toString() {
1104 return "elector id=" + System.identityHashCode(this) +
1105 " appData=" +
1106 ((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
1107 " cb=" + appClient;
1108 }
1109 }