001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.ha;
019
020 import java.io.IOException;
021 import java.io.PrintStream;
022 import java.util.Arrays;
023 import java.util.Map;
024
025 import org.apache.commons.cli.Options;
026 import org.apache.commons.cli.CommandLine;
027 import org.apache.commons.cli.GnuParser;
028 import org.apache.commons.cli.ParseException;
029 import org.apache.commons.logging.Log;
030 import org.apache.commons.logging.LogFactory;
031
032 import org.apache.hadoop.classification.InterfaceAudience;
033 import org.apache.hadoop.conf.Configuration;
034 import org.apache.hadoop.conf.Configured;
035 import org.apache.hadoop.fs.CommonConfigurationKeys;
036 import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
037 import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
038 import org.apache.hadoop.util.Tool;
039 import org.apache.hadoop.util.ToolRunner;
040
041 import com.google.common.base.Preconditions;
042 import com.google.common.collect.ImmutableMap;
043
044 /**
045 * A command-line tool for making calls in the HAServiceProtocol.
 * For example, this can be used to force a service to standby or active
047 * mode, or to trigger a health-check.
048 */
049 @InterfaceAudience.Private
050
051 public abstract class HAAdmin extends Configured implements Tool {
052
053 private static final String FORCEFENCE = "forcefence";
054 private static final String FORCEACTIVE = "forceactive";
055
056 /**
057 * Undocumented flag which allows an administrator to use manual failover
058 * state transitions even when auto-failover is enabled. This is an unsafe
059 * operation, which is why it is not documented in the usage below.
060 */
061 private static final String FORCEMANUAL = "forcemanual";
062 private static final Log LOG = LogFactory.getLog(HAAdmin.class);
063
064 private int rpcTimeoutForChecks = -1;
065
066 protected final static Map<String, UsageInfo> USAGE =
067 ImmutableMap.<String, UsageInfo>builder()
068 .put("-transitionToActive",
069 new UsageInfo("<serviceId>", "Transitions the service into Active state"))
070 .put("-transitionToStandby",
071 new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
072 .put("-failover",
073 new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
074 "Failover from the first service to the second.\n" +
075 "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
076 "Try to failover to the target service even if it is not ready if the " +
077 FORCEACTIVE + " option is used."))
078 .put("-getServiceState",
079 new UsageInfo("<serviceId>", "Returns the state of the service"))
080 .put("-checkHealth",
081 new UsageInfo("<serviceId>",
082 "Requests that the service perform a health check.\n" +
083 "The HAAdmin tool will exit with a non-zero exit code\n" +
084 "if the check fails."))
085 .put("-help",
086 new UsageInfo("<command>", "Displays help on the specified command"))
087 .build();
088
089 /** Output stream for errors, for use in tests */
090 protected PrintStream errOut = System.err;
091 protected PrintStream out = System.out;
092 private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
093
094 protected HAAdmin() {
095 super();
096 }
097
/**
 * Creates the tool, passing the given configuration to the superclass
 * constructor.
 *
 * @param conf the configuration handed to the superclass
 */
protected HAAdmin(Configuration conf) {
  super(conf);
}
101
102 protected abstract HAServiceTarget resolveTarget(String string);
103
104 protected String getUsageString() {
105 return "Usage: HAAdmin";
106 }
107
108 protected void printUsage(PrintStream errOut) {
109 errOut.println(getUsageString());
110 for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
111 String cmd = e.getKey();
112 UsageInfo usage = e.getValue();
113
114 errOut.println(" [" + cmd + " " + usage.args + "]");
115 }
116 errOut.println();
117 ToolRunner.printGenericCommandUsage(errOut);
118 }
119
120 private static void printUsage(PrintStream errOut, String cmd) {
121 UsageInfo usage = USAGE.get(cmd);
122 if (usage == null) {
123 throw new RuntimeException("No usage for cmd " + cmd);
124 }
125 errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]");
126 }
127
128 private int transitionToActive(final CommandLine cmd)
129 throws IOException, ServiceFailedException {
130 String[] argv = cmd.getArgs();
131 if (argv.length != 1) {
132 errOut.println("transitionToActive: incorrect number of arguments");
133 printUsage(errOut, "-transitionToActive");
134 return -1;
135 }
136 HAServiceTarget target = resolveTarget(argv[0]);
137 if (!checkManualStateManagementOK(target)) {
138 return -1;
139 }
140 HAServiceProtocol proto = target.getProxy(
141 getConf(), 0);
142 HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
143 return 0;
144 }
145
146 private int transitionToStandby(final CommandLine cmd)
147 throws IOException, ServiceFailedException {
148 String[] argv = cmd.getArgs();
149 if (argv.length != 1) {
150 errOut.println("transitionToStandby: incorrect number of arguments");
151 printUsage(errOut, "-transitionToStandby");
152 return -1;
153 }
154
155 HAServiceTarget target = resolveTarget(argv[0]);
156 if (!checkManualStateManagementOK(target)) {
157 return -1;
158 }
159 HAServiceProtocol proto = target.getProxy(
160 getConf(), 0);
161 HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
162 return 0;
163 }
164 /**
165 * Ensure that we are allowed to manually manage the HA state of the target
166 * service. If automatic failover is configured, then the automatic
167 * failover controllers should be doing state management, and it is generally
168 * an error to use the HAAdmin command line to do so.
169 *
170 * @param target the target to check
171 * @return true if manual state management is allowed
172 */
173 private boolean checkManualStateManagementOK(HAServiceTarget target) {
174 if (target.isAutoFailoverEnabled()) {
175 if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
176 errOut.println(
177 "Automatic failover is enabled for " + target + "\n" +
178 "Refusing to manually manage HA state, since it may cause\n" +
179 "a split-brain scenario or other incorrect state.\n" +
180 "If you are very sure you know what you are doing, please \n" +
181 "specify the " + FORCEMANUAL + " flag.");
182 return false;
183 } else {
184 LOG.warn("Proceeding with manual HA state management even though\n" +
185 "automatic failover is enabled for " + target);
186 return true;
187 }
188 }
189 return true;
190 }
191
192 private StateChangeRequestInfo createReqInfo() {
193 return new StateChangeRequestInfo(requestSource);
194 }
195
196 private int failover(CommandLine cmd)
197 throws IOException, ServiceFailedException {
198 boolean forceFence = cmd.hasOption(FORCEFENCE);
199 boolean forceActive = cmd.hasOption(FORCEACTIVE);
200
201 int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
202 final String[] args = cmd.getArgs();
203
204 if (numOpts > 3 || args.length != 2) {
205 errOut.println("failover: incorrect arguments");
206 printUsage(errOut, "-failover");
207 return -1;
208 }
209
210 HAServiceTarget fromNode = resolveTarget(args[0]);
211 HAServiceTarget toNode = resolveTarget(args[1]);
212
213 // Check that auto-failover is consistently configured for both nodes.
214 Preconditions.checkState(
215 fromNode.isAutoFailoverEnabled() ==
216 toNode.isAutoFailoverEnabled(),
217 "Inconsistent auto-failover configs between %s and %s!",
218 fromNode, toNode);
219
220 if (fromNode.isAutoFailoverEnabled()) {
221 if (forceFence || forceActive) {
222 // -forceActive doesn't make sense with auto-HA, since, if the node
223 // is not healthy, then its ZKFC will immediately quit the election
224 // again the next time a health check runs.
225 //
226 // -forceFence doesn't seem to have any real use cases with auto-HA
227 // so it isn't implemented.
228 errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
229 "supported with auto-failover enabled.");
230 return -1;
231 }
232 return gracefulFailoverThroughZKFCs(toNode);
233 }
234
235 FailoverController fc = new FailoverController(getConf(),
236 requestSource);
237
238 try {
239 fc.failover(fromNode, toNode, forceFence, forceActive);
240 out.println("Failover from "+args[0]+" to "+args[1]+" successful");
241 } catch (FailoverFailedException ffe) {
242 errOut.println("Failover failed: " + ffe.getLocalizedMessage());
243 return -1;
244 }
245 return 0;
246 }
247
248
249 /**
250 * Initiate a graceful failover by talking to the target node's ZKFC.
251 * This sends an RPC to the ZKFC, which coordinates the failover.
252 *
253 * @param toNode the node to fail to
254 * @return status code (0 for success)
255 * @throws IOException if failover does not succeed
256 */
257 private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
258 throws IOException {
259
260 int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
261 ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
262 try {
263 proxy.gracefulFailover();
264 out.println("Failover to " + toNode + " successful");
265 } catch (ServiceFailedException sfe) {
266 errOut.println("Failover failed: " + sfe.getLocalizedMessage());
267 return -1;
268 }
269
270 return 0;
271 }
272
273 private int checkHealth(final CommandLine cmd)
274 throws IOException, ServiceFailedException {
275 String[] argv = cmd.getArgs();
276 if (argv.length != 1) {
277 errOut.println("checkHealth: incorrect number of arguments");
278 printUsage(errOut, "-checkHealth");
279 return -1;
280 }
281 HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
282 getConf(), rpcTimeoutForChecks);
283 try {
284 HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
285 } catch (HealthCheckFailedException e) {
286 errOut.println("Health check failed: " + e.getLocalizedMessage());
287 return -1;
288 }
289 return 0;
290 }
291
292 private int getServiceState(final CommandLine cmd)
293 throws IOException, ServiceFailedException {
294 String[] argv = cmd.getArgs();
295 if (argv.length != 1) {
296 errOut.println("getServiceState: incorrect number of arguments");
297 printUsage(errOut, "-getServiceState");
298 return -1;
299 }
300
301 HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
302 getConf(), rpcTimeoutForChecks);
303 out.println(proto.getServiceStatus().getState());
304 return 0;
305 }
306
307 /**
308 * Return the serviceId as is, we are assuming it was
309 * given as a service address of form <host:ipcport>.
310 */
311 protected String getServiceAddr(String serviceId) {
312 return serviceId;
313 }
314
315 @Override
316 public void setConf(Configuration conf) {
317 super.setConf(conf);
318 if (conf != null) {
319 rpcTimeoutForChecks = conf.getInt(
320 CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
321 CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
322 }
323 }
324
325 @Override
326 public int run(String[] argv) throws Exception {
327 try {
328 return runCmd(argv);
329 } catch (IllegalArgumentException iae) {
330 errOut.println("Illegal argument: " + iae.getLocalizedMessage());
331 return -1;
332 } catch (IOException ioe) {
333 errOut.println("Operation failed: " + ioe.getLocalizedMessage());
334 if (LOG.isDebugEnabled()) {
335 LOG.debug("Operation failed", ioe);
336 }
337 return -1;
338 }
339 }
340
341 protected int runCmd(String[] argv) throws Exception {
342 if (argv.length < 1) {
343 printUsage(errOut);
344 return -1;
345 }
346
347 String cmd = argv[0];
348
349 if (!cmd.startsWith("-")) {
350 errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
351 printUsage(errOut);
352 return -1;
353 }
354
355 if (!USAGE.containsKey(cmd)) {
356 errOut.println(cmd.substring(1) + ": Unknown command");
357 printUsage(errOut);
358 return -1;
359 }
360
361 Options opts = new Options();
362
363 // Add command-specific options
364 if ("-failover".equals(cmd)) {
365 addFailoverCliOpts(opts);
366 }
367 // Mutative commands take FORCEMANUAL option
368 if ("-transitionToActive".equals(cmd) ||
369 "-transitionToStandby".equals(cmd) ||
370 "-failover".equals(cmd)) {
371 opts.addOption(FORCEMANUAL, false,
372 "force manual control even if auto-failover is enabled");
373 }
374
375 CommandLine cmdLine = parseOpts(cmd, opts, argv);
376 if (cmdLine == null) {
377 // error already printed
378 return -1;
379 }
380
381 if (cmdLine.hasOption(FORCEMANUAL)) {
382 if (!confirmForceManual()) {
383 LOG.fatal("Aborted");
384 return -1;
385 }
386 // Instruct the NNs to honor this request even if they're
387 // configured for manual failover.
388 requestSource = RequestSource.REQUEST_BY_USER_FORCED;
389 }
390
391 if ("-transitionToActive".equals(cmd)) {
392 return transitionToActive(cmdLine);
393 } else if ("-transitionToStandby".equals(cmd)) {
394 return transitionToStandby(cmdLine);
395 } else if ("-failover".equals(cmd)) {
396 return failover(cmdLine);
397 } else if ("-getServiceState".equals(cmd)) {
398 return getServiceState(cmdLine);
399 } else if ("-checkHealth".equals(cmd)) {
400 return checkHealth(cmdLine);
401 } else if ("-help".equals(cmd)) {
402 return help(argv);
403 } else {
404 // we already checked command validity above, so getting here
405 // would be a coding error
406 throw new AssertionError("Should not get here, command: " + cmd);
407 }
408 }
409
410 private boolean confirmForceManual() throws IOException {
411 return ToolRunner.confirmPrompt(
412 "You have specified the " + FORCEMANUAL + " flag. This flag is " +
413 "dangerous, as it can induce a split-brain scenario that WILL " +
414 "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
415 "\n" +
416 "It is recommended not to use this flag, but instead to shut down the " +
417 "cluster and disable automatic failover if you prefer to manually " +
418 "manage your HA state.\n" +
419 "\n" +
420 "You may abort safely by answering 'n' or hitting ^C now.\n" +
421 "\n" +
422 "Are you sure you want to continue?");
423 }
424
425 /**
426 * Add CLI options which are specific to the failover command and no
427 * others.
428 */
429 private void addFailoverCliOpts(Options failoverOpts) {
430 failoverOpts.addOption(FORCEFENCE, false, "force fencing");
431 failoverOpts.addOption(FORCEACTIVE, false, "force failover");
432 // Don't add FORCEMANUAL, since that's added separately for all commands
433 // that change state.
434 }
435
436 private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
437 try {
438 // Strip off the first arg, since that's just the command name
439 argv = Arrays.copyOfRange(argv, 1, argv.length);
440 return new GnuParser().parse(opts, argv);
441 } catch (ParseException pe) {
442 errOut.println(cmdName.substring(1) +
443 ": incorrect arguments");
444 printUsage(errOut, cmdName);
445 return null;
446 }
447 }
448
449 private int help(String[] argv) {
450 if (argv.length == 1) { // only -help
451 printUsage(out);
452 return 0;
453 } else if (argv.length != 2) {
454 printUsage(errOut, "-help");
455 return -1;
456 }
457 String cmd = argv[1];
458 if (!cmd.startsWith("-")) {
459 cmd = "-" + cmd;
460 }
461 UsageInfo usageInfo = USAGE.get(cmd);
462 if (usageInfo == null) {
463 errOut.println(cmd + ": Unknown command");
464 printUsage(errOut);
465 return -1;
466 }
467
468 out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
469 return 0;
470 }
471
472 protected static class UsageInfo {
473 public final String args;
474 public final String help;
475
476 public UsageInfo(String args, String help) {
477 this.args = args;
478 this.help = help;
479 }
480 }
481 }