1 | #!/usr/bin/python26
|
---|
2 | ###
|
---|
3 | ### backmon.commands.check.dssu
|
---|
4 | ###
|
---|
5 |
|
---|
6 | import sys
|
---|
7 | import os
|
---|
8 | import os.path
|
---|
9 | import glob
|
---|
10 | import re
|
---|
11 |
|
---|
12 | from optparse import OptionParser
|
---|
13 | from guppy import hpy
|
---|
14 |
|
---|
15 | from ....lib import *
|
---|
16 |
|
---|
17 | from backup_monitoring.debug import *
|
---|
18 | from backup_monitoring.math import *
|
---|
19 |
|
---|
20 | from backup_monitoring.parsing.parsers import bpstulist
|
---|
21 | from backup_monitoring.parsing.parsers import df
|
---|
22 | from backup_monitoring.parsing.parsers import dsu_ls_l
|
---|
23 | from backup_monitoring.parsing.parsers import nbstlutil
|
---|
24 | from backup_monitoring.parsing.parsers import nbdevquery
|
---|
25 | from backup_monitoring.parsing.parsers import bpdbjobs
|
---|
26 |
|
---|
# Usage banner handed to optparse; %prog is expanded by optparse itself.
usage = 'usage: %prog -e [environment] check dssu [dssu]'

# Option parser for the "check dssu" command.  Positional arguments (the
# storage unit label) are returned untouched by parse_args().
# NOTE(review): the help text below appears to be missing a noun after
# "include" -- confirm the intended wording before changing the string.
parser = OptionParser(usage=usage)
parser.add_option(
    '-u',
    '--updating',
    dest='updating',
    action='store_true',
    default=False,
    help='only include that are updating monitoring data'
)
31 |
|
---|
#
# DSU file names have the form <image id>_<suffix>.<extension>, where the
# image id itself ends in _<digits>.  Group 1 captures the image id.
#
_RE_IMAGE_ID = re.compile(r'^([a-zA-Z0-9\-_.]+_[0-9]+)_([a-zA-Z0-9\-_.]+)\.([a-zA-Z]+)$')


def _find_filesystem(filesystems, path):
    """Return the df entry for *path* or for its parent directory, else None.

    *filesystems* maps mountpoints to df entries for a single server.
    """
    if path in filesystems:
        # path matches a mountpoint
        return filesystems[path]

    head, tail = os.path.split(path)
    if head in filesystems:
        # path is a top-level subdirectory of a mountpoint
        return filesystems[head]

    return None


def _capacity_percentages(environment, server, path, disk_pool):
    """Return (pct_full, pct_free) for the storage unit at *path* on *server*.

    The df data is preferred.  When no filesystem can be matched, the
    disk pool record (if any) is used instead.  With neither source the
    unit is reported as 100% full / 0% free.
    """
    pct_full = 100.0

    filesystem = None
    if server in environment.df_data:
        filesystem = _find_filesystem(environment.df_data[server], path)

    if filesystem is not None:
        pct_full = filesystem.pct_full
    elif disk_pool:
        # Convert once up front so the zero-capacity guard compares numbers.
        total_gb = float(disk_pool.total_capacity_gb)
        free_gb = float(disk_pool.free_space_gb)

        if total_gb > 0.0:
            pct_full = ((total_gb - free_gb) / total_gb) * 100.0
        else:
            pct_full = float(disk_pool.use_pct)

    return pct_full, 100.0 - pct_full


def _image_sizes(directory):
    """Return a dict mapping image id to the summed size of its files.

    Files in *directory.files* whose names do not match the image file
    name pattern are ignored.
    """
    sizes = {}

    for entry in directory.files:
        match = _RE_IMAGE_ID.match(entry.filename)
        if match:
            image_id = match.group(1)
            sizes[image_id] = sizes.get(image_id, 0) + entry.size

    return sizes


def _staged_percentages(image_sizes, lifecycle_images):
    """Return (pct_staged, pct_not_staged), weighted by image size.

    An image counts as staged when its id is in *lifecycle_images*.
    With no images at all the result is (100.0, 0.0).  With images whose
    sizes sum to zero the result is (0.0, 100.0).
    """
    pct_staged = 0.0
    pct_not_staged = 100.0

    image_bytes = 0
    staged_bytes = 0

    for image_id, image_size in image_sizes.items():
        image_bytes += image_size
        if image_id in lifecycle_images:
            staged_bytes += image_size

    if image_bytes > 0:
        pct_staged = (float(staged_bytes) / float(image_bytes)) * 100.0
        pct_not_staged = 100.0 - pct_staged

    if not image_sizes:
        pct_staged = 100.0
        pct_not_staged = 0.0

    return pct_staged, pct_not_staged


def _dssu_state(environment, stunit):
    """Build the state record for one disk staging storage unit."""
    label = stunit.label
    path = stunit.path

    server = environment.resolve_alias(stunit.host_connection)
    server = server.replace('-backup', '')

    #
    # metrics from nbdevquery
    #
    disk_pool = None
    if label in environment.disk_pools:
        disk_pool = environment.disk_pools[label]

    pct_full, pct_free = _capacity_percentages(environment, server, path, disk_pool)

    #
    # get dsu usage from ls -l
    #
    # NOTE(review): a missing listing is treated as "no images", which
    # reports the unit as fully staged -- confirm this is intended.
    image_sizes = {}
    if server in environment.dsu_contents:
        if path in environment.dsu_contents[server]:
            image_sizes = _image_sizes(environment.dsu_contents[server][path])

    pct_staged, pct_not_staged = _staged_percentages(image_sizes, environment.lifecycle_images)

    #
    # status from nbdevquery; stays empty when there is no disk pool record
    #
    status = ''
    if disk_pool:
        if 'AdminUp' in disk_pool.flags and 'InternalUp' in disk_pool.flags:
            status = 'UP'
        else:
            status = 'DOWN'

    pct_usable = pct_free + pct_staged

    state = ExtendedDict()
    state['status'] = status
    state['pct_full'] = pct_full
    state['pct_free'] = pct_free
    state['pct_staged'] = pct_staged
    state['pct_not_staged'] = pct_not_staged
    state['pct_usable'] = pct_usable
    state['pct_not_usable'] = 100.0 - pct_usable

    return state


def run(args, kwargs):
    """Check one disk staging storage unit and exit with a plugin status.

    args   -- command-line arguments for this command; parsed with the
              module-level ``parser``.  The first positional argument is
              the label of the storage unit to check.
    kwargs -- dict of values passed in by the caller.  Only the
              'ENVIRONMENTS' entry is read; its values are the environment
              objects whose monitoring feeds are loaded and parsed.

    One status line with perfdata is written and the process exits via
    sys.exit(); this function does not return normally:

        0 -- storage unit is up
        2 -- storage unit is down
        3 -- no storage unit given, storage unit not found, status not
             determinable, or monitoring data could not be processed
    """
    (options, args) = parser.parse_args(args)

    if not args:
        ERROR('No storage unit specified!')
        sys.exit(3)

    target = args[0]
    states = ExtendedDict()

    master_feeds = ['nbemmcmd_machinealias_getaliases', 'bpstulist', 'nbdevquery_listdv_stype_basicdisk', 'nbdevquery_listdv_stype_advanceddisk', 'nbdevquery_listdv_stype_puredisk', 'df', 'nbstlutil_list']
    media_feeds = ['df', 'dsu_ls_l', ]

    try:

        # Looked up inside the try block so that a missing entry is reported
        # as unavailable monitoring data rather than as a traceback.
        environments = kwargs['ENVIRONMENTS'].values()

        # NOTE(review): the heapy session is never used in this function;
        # kept in case the heap size reporting depends on it -- confirm.
        hp = hpy()

        DEBUG('HEAPSIZE=%s' % (heapsize()))

        for environment in environments:
            environment.load_feeds(master=master_feeds, media=media_feeds)
            environment.parse_aliases()
            environment.parse_stunits()
            environment.parse_df_data()
            environment.parse_dsu_contents()
            environment.parse_lifecycle_images()
            environment.parse_disk_pools()

        DEBUG('HEAPSIZE=%s' % (heapsize()))

        for environment in environments:
            for stunit in environment.stunits:
                #
                # DSSU specific: basic disk storage units with staging enabled
                #
                if stunit.storage_unit_type == 'Disk' and stunit.media_subtype == 'Basic' and stunit.stage_data == 'yes':
                    states[stunit.label] = _dssu_state(environment, stunit)

        if target not in states:
            perfdata = 'perfdata=100;0;0'
            ERROR('Could not find storage unit %s in monitoring data | %s' % (target, perfdata))
            sys.exit(3)

        state = states[target]

        perfdata = 'perfdata=%.0f;%.0f;%.0f' % (state.pct_full, state.pct_staged / 100.0 * state.pct_full, state.pct_staged)

        if state.status == 'UP':
            print('OK: Storage unit is up. | %s' % (perfdata))
            sys.exit(0)

        if state.status == 'DOWN':
            print('Storage unit is DOWN. | %s' % (perfdata))
            sys.exit(2)

        # No disk pool record was found, so the status is neither UP nor
        # DOWN; report that explicitly instead of exiting without output.
        print('UNKNOWN: Storage unit status could not be determined. | %s' % (perfdata))
        sys.exit(3)

    except Exception as e:

        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exits above are not intercepted here.
        perfdata = 'perfdata=100;0;0'
        DEBUG('DSSU check failed: %s' % (e))
        print('DSSU monitoring data unavailable! | %s' % (perfdata))
        sys.exit(3)
328 |
|
---|