|
21 | 21 | import java.util.HashMap; |
22 | 22 | import java.util.List; |
23 | 23 | import java.util.Map; |
| 24 | +import java.util.Optional; |
24 | 25 | import java.util.concurrent.Executors; |
25 | 26 | import java.util.concurrent.ScheduledExecutorService; |
26 | 27 | import java.util.concurrent.TimeUnit; |
|
41 | 42 | import org.apache.cloudstack.managed.context.ManagedContext; |
42 | 43 | import org.apache.cloudstack.managed.context.ManagedContextRunnable; |
43 | 44 | import org.apache.cloudstack.management.ManagementServerHost; |
| 45 | +import org.apache.commons.collections.CollectionUtils; |
44 | 46 | import org.apache.log4j.Logger; |
45 | 47 | import org.apache.log4j.NDC; |
46 | 48 |
|
|
71 | 73 | import com.cloud.network.VpcVirtualNetworkApplianceService; |
72 | 74 | import com.cloud.resource.ResourceManager; |
73 | 75 | import com.cloud.server.ManagementServer; |
74 | | -import com.cloud.service.ServiceOfferingVO; |
75 | 76 | import com.cloud.service.dao.ServiceOfferingDao; |
76 | 77 | import com.cloud.storage.Storage.StoragePoolType; |
77 | 78 | import com.cloud.storage.StorageManager; |
@@ -223,6 +224,18 @@ public void setHaPlanners(List<HAPlanner> haPlanners) { |
223 | 224 | long _timeBetweenCleanups; |
224 | 225 | String _haTag = null; |
225 | 226 |
|
| 227 | + private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) { |
| 228 | + Optional<HaWorkVO> item = pendingHaWorks.stream() |
| 229 | + .filter(h -> h.getInstanceId() == vm.getId()) |
| 230 | + .reduce((first, second) -> second); |
| 231 | + if (item.isPresent() && (item.get().getTimesTried() < _maxRetries || |
| 232 | + !item.get().canScheduleNew(_timeBetweenFailures))) { |
| 233 | + s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm)); |
| 234 | + return true; |
| 235 | + } |
| 236 | + return false; |
| 237 | + } |
| 238 | + |
/**
 * No-argument constructor; performs no initialization of its own.
 */
protected HighAvailabilityManagerImpl() {
    super();
}
228 | 241 |
|
@@ -265,36 +278,44 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate) |
265 | 278 | s_logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName()); |
266 | 279 |
|
267 | 280 | final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId()); |
| 281 | + final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId()); |
268 | 282 | final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); |
269 | 283 |
|
270 | 284 | // send an email alert that the host is down |
271 | 285 | StringBuilder sb = null; |
272 | 286 | List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>(); |
273 | | - if ((vms != null) && !vms.isEmpty()) { |
| 287 | + int skippedHAVms = 0; |
| 288 | + if (CollectionUtils.isNotEmpty(vms)) { |
274 | 289 | sb = new StringBuilder(); |
275 | 290 | sb.append(" Starting HA on the following VMs:"); |
276 | 291 | // collect list of vm names for the alert email |
277 | | - for (int i = 0; i < vms.size(); i++) { |
278 | | - VMInstanceVO vm = vms.get(i); |
| 292 | + for (VMInstanceVO vm : vms) { |
| 293 | + if (vmHasPendingHAJob(pendingHaWorks, vm)) { |
| 294 | + skippedHAVms++; |
| 295 | + continue; |
| 296 | + } |
279 | 297 | if (vm.getType() == VirtualMachine.Type.User) { |
280 | 298 | reorderedVMList.add(vm); |
281 | 299 | } else { |
282 | 300 | reorderedVMList.add(0, vm); |
283 | 301 | } |
284 | 302 | if (vm.isHaEnabled()) { |
285 | | - sb.append(" " + vm.getHostName()); |
| 303 | + sb.append(" ").append(vm.getHostName()); |
286 | 304 | } |
287 | 305 | } |
288 | 306 | } |
289 | | - |
| 307 | + if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) { |
| 308 | + s_logger.debug(String.format( |
| 309 | + "Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host)); |
| 310 | + return; |
| 311 | + } |
290 | 312 | // send an email alert that the host is down, include VMs |
291 | 313 | HostPodVO podVO = _podDao.findById(host.getPodId()); |
292 | 314 | String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); |
293 | 315 | _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc, |
294 | 316 | "Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : "")); |
295 | 317 |
|
296 | 318 | for (VMInstanceVO vm : reorderedVMList) { |
297 | | - ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId()); |
298 | 319 | if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) { |
299 | 320 | if (s_logger.isDebugEnabled()){ |
300 | 321 | s_logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host."); |
|
0 commit comments