001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.ha;
019
020 import java.io.IOException;
021 import java.net.InetSocketAddress;
022 import java.util.Collection;
023 import java.util.regex.Matcher;
024 import java.util.regex.Pattern;
025
026 import org.apache.commons.logging.Log;
027 import org.apache.commons.logging.LogFactory;
028 import org.apache.hadoop.conf.Configured;
029
030 import com.google.common.annotations.VisibleForTesting;
031 import com.jcraft.jsch.ChannelExec;
032 import com.jcraft.jsch.JSch;
033 import com.jcraft.jsch.JSchException;
034 import com.jcraft.jsch.Session;
035
036 /**
037 * This fencing implementation sshes to the target node and uses
038 * <code>fuser</code> to kill the process listening on the service's
039 * TCP port. This is more accurate than using "jps" since it doesn't
040 * require parsing, and will work even if there are multiple service
041 * processes running on the same machine.<p>
042 * It returns a successful status code if:
043 * <ul>
044 * <li><code>fuser</code> indicates it successfully killed a process, <em>or</em>
045 * <li><code>nc -z</code> indicates that nothing is listening on the target port
046 * </ul>
047 * <p>
048 * This fencing mechanism is configured as following in the fencing method
049 * list:
050 * <code>sshfence([[username][:ssh-port]])</code>
051 * where the optional argument specifies the username and port to use
052 * with ssh.
053 * <p>
054 * In order to achieve passwordless SSH, the operator must also configure
055 * <code>dfs.ha.fencing.ssh.private-key-files<code> to point to an
056 * SSH key that has passphrase-less access to the given username and host.
057 */
058 public class SshFenceByTcpPort extends Configured
059 implements FenceMethod {
060
061 static final Log LOG = LogFactory.getLog(
062 SshFenceByTcpPort.class);
063
064 static final String CONF_CONNECT_TIMEOUT_KEY =
065 "dfs.ha.fencing.ssh.connect-timeout";
066 private static final int CONF_CONNECT_TIMEOUT_DEFAULT =
067 30*1000;
068 static final String CONF_IDENTITIES_KEY =
069 "dfs.ha.fencing.ssh.private-key-files";
070
071 /**
072 * Verify that the argument, if given, in the conf is parseable.
073 */
074 @Override
075 public void checkArgs(String argStr) throws BadFencingConfigurationException {
076 if (argStr != null) {
077 new Args(argStr);
078 }
079 }
080
081 @Override
082 public boolean tryFence(HAServiceTarget target, String argsStr)
083 throws BadFencingConfigurationException {
084
085 Args args = new Args(argsStr);
086 InetSocketAddress serviceAddr = target.getAddress();
087 String host = serviceAddr.getHostName();
088
089 Session session;
090 try {
091 session = createSession(serviceAddr.getHostName(), args);
092 } catch (JSchException e) {
093 LOG.warn("Unable to create SSH session", e);
094 return false;
095 }
096
097 LOG.info("Connecting to " + host + "...");
098
099 try {
100 session.connect(getSshConnectTimeout());
101 } catch (JSchException e) {
102 LOG.warn("Unable to connect to " + host
103 + " as user " + args.user, e);
104 return false;
105 }
106 LOG.info("Connected to " + host);
107
108 try {
109 return doFence(session, serviceAddr);
110 } catch (JSchException e) {
111 LOG.warn("Unable to achieve fencing on remote host", e);
112 return false;
113 } finally {
114 session.disconnect();
115 }
116 }
117
118
119 private Session createSession(String host, Args args) throws JSchException {
120 JSch jsch = new JSch();
121 for (String keyFile : getKeyFiles()) {
122 jsch.addIdentity(keyFile);
123 }
124 JSch.setLogger(new LogAdapter());
125
126 Session session = jsch.getSession(args.user, host, args.sshPort);
127 session.setConfig("StrictHostKeyChecking", "no");
128 return session;
129 }
130
131 private boolean doFence(Session session, InetSocketAddress serviceAddr)
132 throws JSchException {
133 int port = serviceAddr.getPort();
134 try {
135 LOG.info("Looking for process running on port " + port);
136 int rc = execCommand(session,
137 "PATH=$PATH:/sbin:/usr/sbin fuser -v -k -n tcp " + port);
138 if (rc == 0) {
139 LOG.info("Successfully killed process that was " +
140 "listening on port " + port);
141 // exit code 0 indicates the process was successfully killed.
142 return true;
143 } else if (rc == 1) {
144 // exit code 1 indicates either that the process was not running
145 // or that fuser didn't have root privileges in order to find it
146 // (eg running as a different user)
147 LOG.info(
148 "Indeterminate response from trying to kill service. " +
149 "Verifying whether it is running using nc...");
150 rc = execCommand(session, "nc -z " + serviceAddr.getHostName() +
151 " " + serviceAddr.getPort());
152 if (rc == 0) {
153 // the service is still listening - we are unable to fence
154 LOG.warn("Unable to fence - it is running but we cannot kill it");
155 return false;
156 } else {
157 LOG.info("Verified that the service is down.");
158 return true;
159 }
160 } else {
161 // other
162 }
163 LOG.info("rc: " + rc);
164 return rc == 0;
165 } catch (InterruptedException e) {
166 LOG.warn("Interrupted while trying to fence via ssh", e);
167 return false;
168 } catch (IOException e) {
169 LOG.warn("Unknown failure while trying to fence via ssh", e);
170 return false;
171 }
172 }
173
174 /**
175 * Execute a command through the ssh session, pumping its
176 * stderr and stdout to our own logs.
177 */
178 private int execCommand(Session session, String cmd)
179 throws JSchException, InterruptedException, IOException {
180 LOG.debug("Running cmd: " + cmd);
181 ChannelExec exec = null;
182 try {
183 exec = (ChannelExec)session.openChannel("exec");
184 exec.setCommand(cmd);
185 exec.setInputStream(null);
186 exec.connect();
187
188 // Pump stdout of the command to our WARN logs
189 StreamPumper outPumper = new StreamPumper(LOG, cmd + " via ssh",
190 exec.getInputStream(), StreamPumper.StreamType.STDOUT);
191 outPumper.start();
192
193 // Pump stderr of the command to our WARN logs
194 StreamPumper errPumper = new StreamPumper(LOG, cmd + " via ssh",
195 exec.getErrStream(), StreamPumper.StreamType.STDERR);
196 errPumper.start();
197
198 outPumper.join();
199 errPumper.join();
200 return exec.getExitStatus();
201 } finally {
202 cleanup(exec);
203 }
204 }
205
206 private static void cleanup(ChannelExec exec) {
207 if (exec != null) {
208 try {
209 exec.disconnect();
210 } catch (Throwable t) {
211 LOG.warn("Couldn't disconnect ssh channel", t);
212 }
213 }
214 }
215
216 private int getSshConnectTimeout() {
217 return getConf().getInt(
218 CONF_CONNECT_TIMEOUT_KEY, CONF_CONNECT_TIMEOUT_DEFAULT);
219 }
220
221 private Collection<String> getKeyFiles() {
222 return getConf().getTrimmedStringCollection(CONF_IDENTITIES_KEY);
223 }
224
225 /**
226 * Container for the parsed arg line for this fencing method.
227 */
228 @VisibleForTesting
229 static class Args {
230 private static final Pattern USER_PORT_RE = Pattern.compile(
231 "([^:]+?)?(?:\\:(\\d+))?");
232
233 private static final int DEFAULT_SSH_PORT = 22;
234
235 String user;
236 int sshPort;
237
238 public Args(String arg)
239 throws BadFencingConfigurationException {
240 user = System.getProperty("user.name");
241 sshPort = DEFAULT_SSH_PORT;
242
243 // Parse optional user and ssh port
244 if (arg != null && !arg.isEmpty()) {
245 Matcher m = USER_PORT_RE.matcher(arg);
246 if (!m.matches()) {
247 throw new BadFencingConfigurationException(
248 "Unable to parse user and SSH port: "+ arg);
249 }
250 if (m.group(1) != null) {
251 user = m.group(1);
252 }
253 if (m.group(2) != null) {
254 sshPort = parseConfiggedPort(m.group(2));
255 }
256 }
257 }
258
259 private Integer parseConfiggedPort(String portStr)
260 throws BadFencingConfigurationException {
261 try {
262 return Integer.valueOf(portStr);
263 } catch (NumberFormatException nfe) {
264 throw new BadFencingConfigurationException(
265 "Port number '" + portStr + "' invalid");
266 }
267 }
268 }
269
270 /**
271 * Adapter from JSch's logger interface to our log4j
272 */
273 private static class LogAdapter implements com.jcraft.jsch.Logger {
274 static final Log LOG = LogFactory.getLog(
275 SshFenceByTcpPort.class.getName() + ".jsch");
276
277 @Override
278 public boolean isEnabled(int level) {
279 switch (level) {
280 case com.jcraft.jsch.Logger.DEBUG:
281 return LOG.isDebugEnabled();
282 case com.jcraft.jsch.Logger.INFO:
283 return LOG.isInfoEnabled();
284 case com.jcraft.jsch.Logger.WARN:
285 return LOG.isWarnEnabled();
286 case com.jcraft.jsch.Logger.ERROR:
287 return LOG.isErrorEnabled();
288 case com.jcraft.jsch.Logger.FATAL:
289 return LOG.isFatalEnabled();
290 default:
291 return false;
292 }
293 }
294
295 @Override
296 public void log(int level, String message) {
297 switch (level) {
298 case com.jcraft.jsch.Logger.DEBUG:
299 LOG.debug(message);
300 break;
301 case com.jcraft.jsch.Logger.INFO:
302 LOG.info(message);
303 break;
304 case com.jcraft.jsch.Logger.WARN:
305 LOG.warn(message);
306 break;
307 case com.jcraft.jsch.Logger.ERROR:
308 LOG.error(message);
309 break;
310 case com.jcraft.jsch.Logger.FATAL:
311 LOG.fatal(message);
312 break;
313 }
314 }
315 }
316 }