1 /**
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19 package org.apache.hadoop.hbase.master;
20
21 import java.io.IOException;
22
23 import org.apache.commons.logging.Log;
24 import org.apache.commons.logging.LogFactory;
25 import org.apache.hadoop.hbase.HBaseTestingUtility;
26 import org.apache.hadoop.hbase.HConstants;
27 import org.apache.hadoop.hbase.HRegionInfo;
28 import org.apache.hadoop.hbase.TableName;
29 import org.apache.hadoop.hbase.client.Durability;
30 import org.apache.hadoop.hbase.client.HTable;
31 import org.apache.hadoop.hbase.client.Put;
32 import org.apache.hadoop.hbase.client.RegionLocator;
33 import org.apache.hadoop.hbase.client.Result;
34 import org.apache.hadoop.hbase.client.ResultScanner;
35 import org.apache.hadoop.hbase.client.Scan;
36 import org.apache.hadoop.hbase.client.Table;
37 import org.apache.hadoop.hbase.testclassification.LargeTests;
38 import org.apache.hadoop.hbase.util.Bytes;
39 import org.junit.AfterClass;
40 import org.junit.Assert;
41 import org.junit.Before;
42 import org.junit.BeforeClass;
43 import org.junit.Ignore;
44 import org.junit.Test;
45 import org.junit.experimental.categories.Category;
46
47 /**
48 * Test transitions of state across the master. Sets up the cluster once and
49 * then runs a couple of tests.
50 */
51 @Category(LargeTests.class)
52 public class TestMasterTransitions {
  private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
  // Shared mini cluster; started once in beforeAllTests, stopped in afterAllTests.
  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  // Multi-region test table created once for all tests in this class.
  private static final TableName TABLENAME = TableName.valueOf("master_transitions");
  // Three column families; getTestFamily() returns the first one.
  private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
    Bytes.toBytes("b"), Bytes.toBytes("c")};
58
59 /**
60 * Start up a mini cluster and put a small table of many empty regions into it.
61 * @throws Exception
62 */
63 @BeforeClass public static void beforeAllTests() throws Exception {
64 TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
65 TEST_UTIL.startMiniCluster(2);
66 // Create a table of three families. This will assign a region.
67 TEST_UTIL.createMultiRegionTable(TABLENAME, FAMILIES);
68 HTable t = (HTable) TEST_UTIL.getConnection().getTable(TABLENAME);
69 int countOfRegions = -1;
70 try (RegionLocator r = t.getRegionLocator()) {
71 countOfRegions = r.getStartKeys().length;
72 }
73 TEST_UTIL.waitUntilAllRegionsAssigned(TABLENAME);
74 addToEachStartKey(countOfRegions);
75 t.close();
76 }
77
  /**
   * Tear down the mini cluster started in beforeAllTests.
   * @throws Exception if cluster shutdown fails
   */
  @AfterClass public static void afterAllTests() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }
81
  /**
   * Runs before each test: ensure at least two region servers are live,
   * restarting replacements if a previous test killed any.
   * @throws IOException if a replacement region server cannot be started
   */
  @Before public void setup() throws IOException {
    TEST_UTIL.ensureSomeRegionServersAvailable(2);
  }
85
86 /**
87 * Listener for regionserver events testing hbase-2428 (Infinite loop of
88 * region closes if hbase:meta region is offline). In particular, listen
89 * for the close of the 'metaServer' and when it comes in, requeue it with a
90 * delay as though there were an issue processing the shutdown. As part of
91 * the requeuing, send over a close of a region on 'otherServer' so it comes
92 * into a master that has its meta region marked as offline.
93 */
94 /*
95 static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do repeated delays.
97 private final Set<RegionServerOperation> postponed =
98 new CopyOnWriteArraySet<RegionServerOperation>();
    private boolean done = false;
100 private boolean metaShutdownReceived = false;
101 private final HServerAddress metaAddress;
102 private final MiniHBaseCluster cluster;
103 private final int otherServerIndex;
104 private final HRegionInfo hri;
105 private int closeCount = 0;
106 static final int SERVER_DURATION = 3 * 1000;
107 static final int CLOSE_DURATION = 1 * 1000;
108
109 HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
110 final HRegionInfo closingHRI, final int otherServerIndex) {
111 this.cluster = c;
112 this.metaAddress = metaAddress;
113 this.hri = closingHRI;
114 this.otherServerIndex = otherServerIndex;
115 }
116
117 @Override
118 public boolean process(final RegionServerOperation op) throws IOException {
119 // If a regionserver shutdown and its of the meta server, then we want to
120 // delay the processing of the shutdown and send off a close of a region on
121 // the 'otherServer.
122 boolean result = true;
123 if (op instanceof ProcessServerShutdown) {
124 ProcessServerShutdown pss = (ProcessServerShutdown)op;
125 if (pss.getDeadServerAddress().equals(this.metaAddress)) {
126 // Don't postpone more than once.
127 if (!this.postponed.contains(pss)) {
128 // Close some region.
129 this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
130 new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
131 Bytes.toBytes("Forcing close in test")));
132 this.postponed.add(pss);
133 // Put off the processing of the regionserver shutdown processing.
134 pss.setDelay(SERVER_DURATION);
135 this.metaShutdownReceived = true;
136 // Return false. This will add this op to the delayed queue.
137 result = false;
138 }
139 }
140 } else {
141 // Have the close run frequently.
142 if (isWantedCloseOperation(op) != null) {
143 op.setDelay(CLOSE_DURATION);
144 // Count how many times it comes through here.
145 this.closeCount++;
146 }
147 }
148 return result;
149 }
150
151 public void processed(final RegionServerOperation op) {
152 if (isWantedCloseOperation(op) != null) return;
153 this.done = true;
154 }
155 */
156 /*
157 * @param op
158 * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
159 * cast as a ProcessRegionClose.
160 */
161 /*
162 private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
163 // Count every time we get a close operation.
164 if (op instanceof ProcessRegionClose) {
165 ProcessRegionClose c = (ProcessRegionClose)op;
166 if (c.regionInfo.equals(hri)) {
167 return c;
168 }
169 }
170 return null;
171 }
172
173 boolean isDone() {
174 return this.done;
175 }
176
177 boolean isMetaShutdownReceived() {
178 return metaShutdownReceived;
179 }
180
181 int getCloseCount() {
182 return this.closeCount;
183 }
184
185 @Override
186 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
187 return true;
188 }
189 }
190 */
  /**
   * In 2428, the meta region has just been set offline and then a close comes
   * in.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
   */
  @Ignore @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
  throws Exception {
    // NOTE(review): the body below is pre-0.90 master code (HMsg,
    // RegionServerOperationQueue) that no longer compiles against current
    // APIs; it is kept commented out for reference. Test is @Ignore'd.
    /*
    LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    final HMaster master = cluster.getMaster();
    int metaIndex = cluster.getServerWithMeta();
    // Figure the index of the server that is not serving the hbase:meta
    int otherServerIndex = -1;
    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
      if (i == metaIndex) continue;
      otherServerIndex = i;
      break;
    }
    final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
    final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);

    // Get a region out on the otherServer.
    final HRegionInfo hri =
      otherServer.getOnlineRegions().iterator().next().getRegionInfo();

    // Add our RegionServerOperationsListener
    HBase2428Listener listener = new HBase2428Listener(cluster,
      metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
    master.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Now close the server carrying meta.
      cluster.abortRegionServer(metaIndex);

      // First wait on receipt of meta server shutdown message.
      while(!listener.metaShutdownReceived) Threads.sleep(100);
      while(!listener.isDone()) Threads.sleep(10);
      // We should not have retried the close more times than it took for the
      // server shutdown message to exit the delay queue and get processed
      // (Multiply by two to add in some slop in case of GC or something).
      assertTrue(listener.getCloseCount() > 1);
      assertTrue(listener.getCloseCount() <
        ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));

      // Assert the closed region came back online
      assertRegionIsBackOnline(hri);
    } finally {
      master.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }
244
  /**
   * Test adding in a new server before old one on same host+port is dead.
   * Make the test more onerous by having the server under test carry the meta.
   * If confusion between old and new, purportedly meta never comes back. Test
   * that meta gets redeployed.
   */
  @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
  throws IOException {
    // NOTE(review): the body below targets the pre-0.90 master
    // (MiniHBaseClusterRegionServer.kill, HServerInfo) and no longer compiles;
    // kept commented out for reference. Test is @Ignore'd.
    /*
    LOG.info("Running testAddingServerBeforeOldIsDead2413");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    int count = count();
    int metaIndex = cluster.getServerWithMeta();
    MiniHBaseClusterRegionServer metaHRS =
      (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
    int port = metaHRS.getServerInfo().getServerAddress().getPort();
    Configuration c = TEST_UTIL.getConfiguration();
    String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
    try {
      LOG.info("KILLED=" + metaHRS);
      metaHRS.kill();
      c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
      // Try and start new regionserver. It might clash with the old
      // regionserver port so keep trying to get past the BindException.
      HRegionServer hrs = null;
      while (true) {
        try {
          hrs = cluster.startRegionServer().getRegionServer();
          break;
        } catch (IOException e) {
          if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
            InvocationTargetException ee = (InvocationTargetException)e.getCause();
            if (ee.getCause() != null && ee.getCause() instanceof BindException) {
              LOG.info("BindException; retrying: " + e.toString());
            }
          }
        }
      }
      LOG.info("STARTED=" + hrs);
      // Wait until he's been given at least 3 regions before we go on to try
      // and count rows in table.
      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
      LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
        " regions");
      assertEquals(count, count());
    } finally {
      c.set(HConstants.REGIONSERVER_PORT, oldPort);
    }
    */
  }
295
296 /**
297 * HBase2482 is about outstanding region openings. If any are outstanding
298 * when a regionserver goes down, then they'll never deploy. They'll be
299 * stuck in the regions-in-transition list for ever. This listener looks
300 * for a region opening HMsg and if its from the server passed on construction,
301 * then we kill it. It also looks out for a close message on the victim
302 * server because that signifies start of the fireworks.
303 */
304 /*
305 static class HBase2482Listener implements RegionServerOperationListener {
306 private final HRegionServer victim;
307 private boolean abortSent = false;
308 // We closed regions on new server.
309 private volatile boolean closed = false;
310 // Copy of regions on new server
311 private final Collection<HRegion> copyOfOnlineRegions;
312 // This is the region that was in transition on the server we aborted. Test
313 // passes if this region comes back online successfully.
314 private HRegionInfo regionToFind;
315
316 HBase2482Listener(final HRegionServer victim) {
317 this.victim = victim;
318 // Copy regions currently open on this server so I can notice when
319 // there is a close.
320 this.copyOfOnlineRegions =
321 this.victim.getCopyOfOnlineRegionsSortedBySize().values();
322 }
323
324 @Override
325 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
326 if (!victim.getServerInfo().equals(serverInfo) ||
327 this.abortSent || !this.closed) {
328 return true;
329 }
330 if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
331 // Save the region that is in transition so can test later it came back.
332 this.regionToFind = incomingMsg.getRegionInfo();
333 String msg = "ABORTING " + this.victim + " because got a " +
334 HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
335 incomingMsg.getRegionInfo().getRegionNameAsString();
336 this.victim.abort(msg);
337 this.abortSent = true;
338 return true;
339 }
340
341 @Override
342 public boolean process(RegionServerOperation op) throws IOException {
343 return true;
344 }
345
346 @Override
347 public void processed(RegionServerOperation op) {
348 if (this.closed || !(op instanceof ProcessRegionClose)) return;
349 ProcessRegionClose close = (ProcessRegionClose)op;
350 for (HRegion r: this.copyOfOnlineRegions) {
351 if (r.getRegionInfo().equals(close.regionInfo)) {
352 // We've closed one of the regions that was on the victim server.
353 // Now can start testing for when all regions are back online again
354 LOG.info("Found close of " +
355 r.getRegionInfo().getRegionNameAsString() +
356 "; setting close happened flag");
357 this.closed = true;
358 break;
359 }
360 }
361 }
362 }
363 */
  /**
   * In 2482, a RS with an opening region on it dies. The said region is then
   * stuck in the master's regions-in-transition and never leaves it. This
   * test works by bringing up a new regionserver, waiting for the load
   * balancer to give it some regions. Then, we close all on the new server.
   * After sending all the close messages, we send the new regionserver the
   * special blocking message so it can not process any more messages.
   * Meantime reopening of the just-closed regions is backed up on the new
   * server. Soon as master gets an opening region from the new regionserver,
   * we kill it. We then wait on all regions to come back on line. If bug
   * is fixed, this should happen soon as the processing of the killed server is
   * done.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
   */
  @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
  throws Exception {
    // NOTE(review): the body below uses the pre-0.90 HMsg-based master
    // messaging and no longer compiles; kept commented out for reference.
    // Test is @Ignore'd.
    /*
    LOG.info("Running testKillRSWithOpeningRegion2482");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    if (cluster.getLiveRegionServerThreads().size() < 2) {
      // Need at least two servers.
      cluster.startRegionServer();
    }
    // Count how many regions are online. They need to be all back online for
    // this test to succeed.
    int countOfMetaRegions = countOfMetaRegions();
    // Add a listener on the server.
    HMaster m = cluster.getMaster();
    // Start new regionserver.
    MiniHBaseClusterRegionServer hrs =
      (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
    LOG.info("Started new regionserver: " + hrs.toString());
    // Wait until has some regions before proceeding. Balancer will give it some.
    int minimumRegions =
      countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
    while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
    // Set the listener only after some regions have been opened on new server.
    HBase2482Listener listener = new HBase2482Listener(hrs);
    m.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Go close all non-catalog regions on this new server
      closeAllNonCatalogRegions(cluster, hrs);
      // After all closes, add blocking message before the region opens start to
      // come in.
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
      // Wait till one of the above close messages has an effect before we start
      // wait on all regions back online.
      while (!listener.closed) Threads.sleep(100);
      LOG.info("Past close");
      // Make sure the abort server message was sent.
      while(!listener.abortSent) Threads.sleep(100);
      LOG.info("Past abort send; waiting on all regions to redeploy");
      // Now wait for regions to come back online.
      assertRegionIsBackOnline(listener.regionToFind);
    } finally {
      m.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }
426
427 /*
428 * @return Count of all non-catalog regions on the designated server
429 */
430 /*
431 private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
432 final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
433 throws IOException {
434 int countOfRegions = 0;
435 for (HRegion r: hrs.getOnlineRegions()) {
436 if (r.getRegionInfo().isMetaRegion()) continue;
437 cluster.addMessageToSendRegionServer(hrs,
438 new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
439 LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
440 " on " + hrs.toString());
441 countOfRegions++;
442 }
443 return countOfRegions;
444 }
445
446 private void assertRegionIsBackOnline(final HRegionInfo hri)
447 throws IOException {
    // Region should have an entry in its startkey because of addToEachStartKey.
449 byte [] row = getStartKey(hri);
450 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
451 Get g = new Get(row);
452 assertTrue((t.get(g)).size() > 0);
453 }
454
455 /*
456 * @return Count of regions in meta table.
457 * @throws IOException
458 */
459 /*
460 private static int countOfMetaRegions()
461 throws IOException {
462 HTable meta = new HTable(TEST_UTIL.getConfiguration(),
463 HConstants.META_TABLE_NAME);
464 int rows = 0;
465 Scan scan = new Scan();
466 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
467 ResultScanner s = meta.getScanner(scan);
468 for (Result r = null; (r = s.next()) != null;) {
469 byte [] b =
470 r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
471 if (b == null || b.length <= 0) break;
472 rows++;
473 }
474 s.close();
475 return rows;
476 }
477 */
478 /*
479 * Add to each of the regions in hbase:meta a value. Key is the startrow of the
480 * region (except its 'aaa' for first region). Actual value is the row name.
481 * @param expected
482 * @return
483 * @throws IOException
484 */
485 private static int addToEachStartKey(final int expected) throws IOException {
486 Table t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
487 Table meta = new HTable(TEST_UTIL.getConfiguration(),
488 TableName.META_TABLE_NAME);
489 int rows = 0;
490 Scan scan = new Scan();
491 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
492 ResultScanner s = meta.getScanner(scan);
493 for (Result r = null; (r = s.next()) != null;) {
494 HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
495 if (hri == null) break;
496 if (!hri.getTable().equals(TABLENAME)) {
497 continue;
498 }
499
500 // If start key, add 'aaa'.
501 if(!hri.getTable().equals(TABLENAME)) {
502 continue;
503 }
504 byte [] row = getStartKey(hri);
505 Put p = new Put(row);
506 p.setDurability(Durability.SKIP_WAL);
507 p.add(getTestFamily(), getTestQualifier(), row);
508 t.put(p);
509 rows++;
510 }
511 s.close();
512 Assert.assertEquals(expected, rows);
513 t.close();
514 meta.close();
515 return rows;
516 }
517
518 /*
519 * @param hri
520 * @return Start key for hri (If start key is '', then return 'aaa'.
521 */
522 private static byte [] getStartKey(final HRegionInfo hri) {
523 return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
524 Bytes.toBytes("aaa"): hri.getStartKey();
525 }
526
  // First declared column family; the family the marker rows are written to.
  private static byte [] getTestFamily() {
    return FAMILIES[0];
  }
530
  // Qualifier is deliberately the same bytes as the test family.
  private static byte [] getTestQualifier() {
    return getTestFamily();
  }
534 }