[976] | 1 | #!/usr/bin/python26
|
---|
| 2 | ###
|
---|
| 3 | ### backmon.commands.check.dssu
|
---|
| 4 | ###
|
---|
| 5 |
|
---|
| 6 | import sys
|
---|
| 7 | import os
|
---|
| 8 | import os.path
|
---|
| 9 | import glob
|
---|
| 10 | import re
|
---|
| 11 |
|
---|
| 12 | from optparse import OptionParser
|
---|
| 13 | from guppy import hpy
|
---|
| 14 |
|
---|
| 15 | from ....lib import *
|
---|
| 16 |
|
---|
| 17 | from backup_monitoring.debug import *
|
---|
| 18 | from backup_monitoring.math import *
|
---|
| 19 |
|
---|
| 20 | from backup_monitoring.parsing.parsers import bpstulist
|
---|
| 21 | from backup_monitoring.parsing.parsers import df
|
---|
| 22 | from backup_monitoring.parsing.parsers import dsu_ls_l
|
---|
| 23 | from backup_monitoring.parsing.parsers import nbstlutil
|
---|
| 24 | from backup_monitoring.parsing.parsers import nbdevquery
|
---|
| 25 | from backup_monitoring.parsing.parsers import bpdbjobs
|
---|
| 26 |
|
---|
#
# command-line interface of the "check dssu" sub-command: one positional
# storage unit name plus an optional -u/--updating flag (off by default)
#
usage = 'usage: %prog -e [environment] check dssu [dssu]'

parser = OptionParser(usage=usage)
parser.add_option(
    '-u',
    '--updating',
    dest='updating',
    action='store_true',
    default=False,
    help='only include that are updating monitoring data'
)
|
---|
| 31 |
|
---|
#
# names in kwargs matching this pattern are injected into run()'s locals
#
_RE_INJECTED_NAME = re.compile('^[A-Z][A-Z_]+$')

#
# file names in a DSU directory listing that belong to a backup image;
# group 1 is used as the image id
#
_RE_IMAGE_FILE = re.compile(r'^([a-zA-Z0-9\-_.]+_[0-9]+)_([a-zA-Z0-9\-_.]+)\.([a-zA-Z]+)$')


def _match_mountpoint(mounts, path):
    """Return the key of ``mounts`` that covers ``path``, or None.

    ``path`` itself is tried first (the storage unit path is a mountpoint),
    then its parent directory (the path is a top-level subdirectory of a
    mountpoint).  Deeper nesting is not searched.
    """
    if path in mounts:
        return path

    head, _tail = os.path.split(path)
    if head in mounts:
        return head

    return None


def _pool_pct_full(disk_pool):
    """Return the percentage used of a disk pool record as a float.

    The value is derived from total_capacity_gb / free_space_gb.  When the
    total capacity is not positive the record's own use_pct is used instead,
    which also avoids dividing by zero.
    """
    total_gb = float(disk_pool.total_capacity_gb)
    free_gb = float(disk_pool.free_space_gb)

    if total_gb > 0.0:
        return ((total_gb - free_gb) / total_gb) * 100.0

    return float(disk_pool.use_pct)


def _staged_percentages(entries, lifecycle_images):
    """Return (pct_staged, pct_not_staged) for one DSU directory listing.

    File sizes are summed per image id (see _RE_IMAGE_FILE); files whose name
    does not match the pattern are ignored.  An image counts as staged when
    its id is present in ``lifecycle_images``.

    * no images at all          -> (100.0, 0.0)
    * images totalling 0 bytes  -> (0.0, 100.0)
    * otherwise                 -> byte-weighted percentages
    """
    image_sizes = {}

    for entry in entries:
        match = _RE_IMAGE_FILE.match(entry.filename)
        if match:
            image_id = match.group(1)
            image_sizes[image_id] = image_sizes.get(image_id, 0) + entry.size

    if not image_sizes:
        return 100.0, 0.0

    image_bytes = 0
    staged_bytes = 0

    for image_id, image_size in image_sizes.items():
        image_bytes += image_size
        if image_id in lifecycle_images:
            staged_bytes += image_size

    if image_bytes > 0:
        pct_staged = (float(staged_bytes) / float(image_bytes)) * 100.0
        return pct_staged, 100.0 - pct_staged

    return 0.0, 100.0


def _pool_status(disk_pool):
    """Return 'UP' when the pool flags hold AdminUp and InternalUp, else 'DOWN'."""
    if 'AdminUp' in disk_pool.flags and 'InternalUp' in disk_pool.flags:
        return 'UP'
    return 'DOWN'


def run(args, kwargs):
    """Check one disk staging storage unit (DSSU) and exit with a plugin status.

    args   -- command-line arguments; after option parsing the first remaining
              argument is the label of the storage unit to check
    kwargs -- mapping supplied by the caller; every entry whose key is all
              upper-case letters/underscores is injected into the local
              namespace (ENVIRONMENTS is read from there)

    The -u/--updating option is accepted by the parser but is not consulted
    by this check.

    The function does not return normally; it always ends in sys.exit():

        0 -- storage unit is UP
        2 -- storage unit is DOWN
        3 -- no storage unit given, storage unit not found, status could not
             be determined, or any exception while loading/parsing the feeds

    Every message carries 'perfdata=<pct_full>;<staged share of pct_full>;
    <pct_staged>'; the error paths use the fixed 'perfdata=100;0;0'.
    """

    #
    # add kwargs to local namespace
    #
    # NOTE: this relies on the Python 2 exec statement, which can bind
    # function locals; exec() in Python 3 cannot do that.
    #
    for key in kwargs:
        if _RE_INJECTED_NAME.match(key):
            exec(key + ' = kwargs[\'' + key + '\']')

    (options, args) = parser.parse_args(args)

    if not args:
        ERROR('No storage unit specified!')
        sys.exit(3)

    STUNIT = args[0]
    STATUS = ExtendedDict()

    master_feeds = ['nbemmcmd_machinealias_getaliases', 'bpstulist', 'nbdevquery_listdv_stype_basicdisk', 'nbdevquery_listdv_stype_advanceddisk', 'nbdevquery_listdv_stype_puredisk', 'df', 'nbstlutil_list']
    media_feeds = ['df', 'dsu_ls_l', ]

    try:

        environments = ENVIRONMENTS.values()

        #
        # NOTE(review): hp is not used below; the call is kept in case
        # heapsize() depends on guppy having been initialised -- confirm.
        #
        hp = hpy()

        DEBUG('HEAPSIZE=%s' % (heapsize()))

        for environment in environments:
            environment.load_feeds(master=master_feeds, media=media_feeds)
            environment.parse_aliases()
            environment.parse_stunits()
            environment.parse_df_data()
            environment.parse_dsu_contents()
            environment.parse_lifecycle_images()
            environment.parse_disk_pools()

        DEBUG('HEAPSIZE=%s' % (heapsize()))

        for environment in environments:

            for stunit in environment.stunits:

                server = environment.resolve_alias(stunit.host_connection)
                server = server.replace('-backup', '')

                label = stunit.label

                #
                # DSSU specific: only basic disk units that stage data
                #
                if not (stunit.storage_unit_type == 'Disk' and stunit.media_subtype == 'Basic' and stunit.stage_data == 'yes'):
                    continue

                path = stunit.path

                #
                # metrics from nbdevquery; basic disk pools are keyed by
                # the storage unit label
                #
                disk_pool = None
                if label in environment.disk_pools:
                    disk_pool = environment.disk_pools[label]

                #
                # pessimistic defaults used when a feed has no data
                #
                status = ''
                pct_full = 100.0
                pct_free = 0.0
                pct_staged = 0.0
                pct_not_staged = 100.0

                #
                # get capacity from df -k; nbdevquery is only consulted
                # when df has no data at all for the server
                #
                if server in environment.df_data:

                    mounts = environment.df_data[server]
                    mountpoint = _match_mountpoint(mounts, path)

                    if mountpoint is not None:
                        pct_full = mounts[mountpoint].pct_full
                        pct_free = 100.0 - pct_full

                elif disk_pool:

                    pct_full = _pool_pct_full(disk_pool)
                    pct_free = 100.0 - pct_full

                #
                # get amount staged from ls -l of the storage unit path
                #
                # NOTE(review): indentation was lost in the recovered
                # source; the summary is assumed to run only when a
                # listing exists for this path -- confirm.
                #
                if server in environment.dsu_contents:

                    if path in environment.dsu_contents[server]:

                        listing = environment.dsu_contents[server][path]
                        pct_staged, pct_not_staged = _staged_percentages(listing.files, environment.lifecycle_images)

                #
                # status from nbdevquery
                #
                if disk_pool:
                    status = _pool_status(disk_pool)

                pct_usable = pct_free + pct_staged
                pct_not_usable = 100.0 - pct_usable

                state = ExtendedDict()

                state['status'] = status
                state['pct_full'] = pct_full
                state['pct_free'] = pct_free
                state['pct_staged'] = pct_staged
                state['pct_not_staged'] = pct_not_staged
                state['pct_usable'] = pct_usable
                state['pct_not_usable'] = pct_not_usable

                #
                # NOTE(review): assumed to be recorded for DSSUs only, so
                # any other storage unit is reported as not found -- confirm.
                #
                STATUS[label] = state

        if STUNIT not in STATUS:

            perfdata = 'perfdata=100;0;0'
            ERROR('Could not find storage unit %s in monitoring data | %s' % (STUNIT, perfdata))
            sys.exit(3)

        state = STATUS[STUNIT]

        perfdata = 'perfdata=%.0f;%.0f;%.0f' % (state.pct_full, state.pct_staged / 100.0 * state.pct_full, state.pct_staged)

        if state.status == 'UP':
            print('OK: Storage unit is up. | %s' % (perfdata))
            sys.exit(0)

        if state.status == 'DOWN':
            print('Storage unit is DOWN. | %s' % (perfdata))
            sys.exit(2)

        #
        # no disk pool record was found, so the state is unknown; report
        # that explicitly instead of returning without output or exit code
        #
        print('UNKNOWN: Storage unit status could not be determined. | %s' % (perfdata))
        sys.exit(3)

    except Exception:

        #
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exits above pass through; anything else means the feeds
        # could not be loaded or parsed
        #
        perfdata = 'perfdata=100;0;0'
        print('DSSU monitoring data unavailable! | %s' % (perfdata))
        sys.exit(3)
|
---|
| 328 |
|
---|