|
| 1 | +# Copyright (c) 2011 Mathieu Turcotte |
| 2 | +# Licensed under the MIT license. |
| 3 | +# |
| 4 | +# Permission is hereby granted, free of charge, to any person obtaining a copy of |
| 5 | +# this software and associated documentation files (the "Software"), to deal in |
| 6 | +# the Software without restriction, including without limitation the rights to |
| 7 | +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies |
| 8 | +# of the Software, and to permit persons to whom the Software is furnished to do |
| 9 | +# so, subject to the following conditions: |
| 10 | +# |
| 11 | +# The above copyright notice and this permission notice shall be included in all |
| 12 | +# copies or substantial portions of the Software. |
| 13 | +# |
| 14 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 15 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 16 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 17 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 18 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 19 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 20 | +# SOFTWARE. |
| 21 | + |
| 22 | +""" |
| 23 | +The msparser module offers a simple interface to parse the Valgrind massif.out |
file format, i.e. data files produced by the Valgrind heap profiler.
| 25 | +""" |
| 26 | + |
| 27 | +from __future__ import with_statement # Enable with statement in Python 2.5. |
| 28 | +import os.path |
| 29 | +import re |
| 30 | + |
| 31 | +__all__ = ["parse", "parse_file", "ParseError"] |
| 32 | + |
| 33 | +# Precompiled regex used to parse comments. |
| 34 | +_COMMENT_RE = re.compile("\s*(#|$)") |
| 35 | + |
| 36 | +# Precompiled regexes used to parse header fields. |
| 37 | +_FIELD_DESC_RE = re.compile("desc:\s(?P<data>.*)$") |
| 38 | +_FIELD_CMD_RE = re.compile("cmd:\s(?P<data>.*)$") |
| 39 | +_FIELD_TIME_UNIT_RE = re.compile("time_unit:\s(?P<data>ms|B|i)$") |
| 40 | + |
| 41 | +# Precompiled regexes used to parse snaphot fields. |
| 42 | +_FIELD_SNAPSHOT_RE = re.compile("snapshot=(?P<data>\d+)") |
| 43 | +_FIELD_TIME_RE = re.compile("time=(?P<data>\d+)") |
| 44 | +_FIELD_MEM_HEAP_RE = re.compile("mem_heap_B=(?P<data>\d+)") |
| 45 | +_FIELD_MEM_EXTRA_RE = re.compile("mem_heap_extra_B=(?P<data>\d+)") |
| 46 | +_FIELD_MEM_STACK_RE = re.compile("mem_stacks_B=(?P<data>\d+)") |
| 47 | +_FIELD_HEAP_TREE_RE = re.compile("heap_tree=(?P<data>\w+)") |
| 48 | + |
| 49 | +# Precompiled regex to parse heap entries. Matches three things: |
| 50 | +# - the number of children, |
| 51 | +# - the number of bytes, |
| 52 | +# - and the details section. |
| 53 | +_HEAP_ENTRY_RE = re.compile(""" |
| 54 | + \s*n # skip zero or more spaces, then 'n' |
| 55 | + (?P<num_children>\d+) # match number of children, 1 or more digits |
| 56 | + :\s # skip ':' and one space |
| 57 | + (?P<num_bytes>\d+) # match the number of bytes, 1 or more digits |
| 58 | + \s # skip one space |
| 59 | + (?P<details>.*) # match the details |
| 60 | +""", re.VERBOSE) |
| 61 | + |
| 62 | +# Precompiled regex to check if the details section is below threshold. |
| 63 | +_HEAP_BELOW_THRESHOLD_RE = re.compile(r"""in.*places?.*""") |
| 64 | + |
| 65 | +# Precompiled regex to parse the details section of entries above threshold. |
| 66 | +# This should match four things: |
| 67 | +# - the hexadecimal address, |
| 68 | +# - the function name, |
| 69 | +# - the file name or binary path, i.e. file.cpp or usr/local/bin/foo.so, |
| 70 | +# - and a line number if present. |
| 71 | +# Last two parts are optional to handle entries without a file name or binary |
| 72 | +# path. |
| 73 | +_HEAP_DETAILS_RE = re.compile(r""" |
| 74 | + (?P<address>[a-fA-F0-9x]+) # match the hexadecimal address |
| 75 | + :\s # skip ': ' |
| 76 | + (?P<function>.+?) # match the function's name, non-greedy |
| 77 | + (?: # don't capture fname/line group |
| 78 | + \s |
| 79 | + \( |
| 80 | + (?:in\s)? # skip 'in ' if present |
| 81 | + (?P<fname>[^:]+) # match the file name |
| 82 | + :? # skip ':', if present |
| 83 | + (?P<line>\d+)? # match the line number, if present |
| 84 | + \) |
| 85 | + )? # fname/line group is optional |
| 86 | + $ # should have reached the EOL |
| 87 | +""", re.VERBOSE) |
| 88 | + |
| 89 | + |
class ParseContext:
    """
    Minimal line-counting wrapper around an open file descriptor; a
    dumbed down stand-in for the standard fileinput module.
    """
    def __init__(self, fd):
        self._stream = fd
        self._lineno = 0

    def line(self):
        """Return the number of lines read so far."""
        return self._lineno

    def readline(self):
        """Read one line from the descriptor, bumping the line counter."""
        self._lineno += 1
        return self._stream.readline()

    def filename(self):
        """Return the absolute path of the underlying file."""
        return os.path.abspath(self._stream.name)
| 107 | + |
| 108 | + |
class ParseError(Exception):
    """
    Raised on malformed input; records the message along with the line
    number and file name taken from the parse context.
    """
    def __init__(self, msg, ctx):
        self.msg = msg
        self.line = ctx.line()
        self.filename = ctx.filename()

    def __str__(self):
        return "%s at line %s in %s" % (self.msg, self.line, self.filename)
| 121 | + |
| 122 | + |
def parse_file(filepath):
    """
    Parse the massif output file located at filepath.

    Convenience wrapper around parse() that takes a file path instead of
    an open file descriptor.
    """
    stream = open(filepath)
    try:
        return parse(stream)
    finally:
        stream.close()
| 129 | + |
| 130 | + |
def parse(fd):
    """
    Parse an already opened massif output file descriptor and return the
    extracted data as a dict.
    """
    context = ParseContext(fd)
    result = {}
    _parse_header(context, result)
    _parse_snapshots(context, result)
    return result
| 140 | + |
| 141 | + |
| 142 | +def _match_unconditional(ctx, regex, string): |
| 143 | + """ |
| 144 | + Unconditionaly match a regular expression against a string, i.e. if there |
| 145 | + is no match we raise a ParseError. |
| 146 | + """ |
| 147 | + match = regex.match(string) |
| 148 | + if match is None: |
| 149 | + raise ParseError("".join(["can't match '", string, "' against '", |
| 150 | + regex.pattern, "'"]), ctx) |
| 151 | + return match |
| 152 | + |
| 153 | + |
| 154 | +def _get_next_line(ctx, may_reach_eof=False): |
| 155 | + """ |
| 156 | + Read another line from ctx. If may_reach_eof is False, reaching EOF will |
| 157 | + be considered as an error. |
| 158 | + """ |
| 159 | + line = ctx.readline() # Returns an empty string on EOF. |
| 160 | + |
| 161 | + if len(line) == 0: |
| 162 | + if may_reach_eof is False: |
| 163 | + raise ParseError("unexpected EOF", ctx) |
| 164 | + else: |
| 165 | + return None |
| 166 | + else: |
| 167 | + return line.strip("\n") |
| 168 | + |
| 169 | + |
def _get_next_field(ctx, field_regex, may_reach_eof=False):
    """
    Read lines from ctx until one matches field_regex, skipping comment
    lines, and return the matched field's 'data' group. On EOF, return
    None when may_reach_eof is true; otherwise a ParseError is raised.
    """
    while True:
        line = _get_next_line(ctx, may_reach_eof)
        if line is None:
            return None
        if not _COMMENT_RE.match(line):
            match = _match_unconditional(ctx, field_regex, line)
            return match.group("data")
| 186 | + |
| 187 | + |
def _parse_header(ctx, mdata):
    """Extract the desc, cmd and time_unit header fields into mdata."""
    header_fields = (
        ("desc", _FIELD_DESC_RE),
        ("cmd", _FIELD_CMD_RE),
        ("time_unit", _FIELD_TIME_UNIT_RE),
    )
    for key, regex in header_fields:
        mdata[key] = _get_next_field(ctx, regex)
| 192 | + |
| 193 | + |
def _parse_snapshots(ctx, mdata):
    """
    Parse every snapshot in the file, recording into mdata the snapshot
    data list, the indices of detailed snapshots and, when one exists,
    the index of the peak snapshot.
    """
    snapshots = []
    detailed_indices = []
    peak_index = None

    index = 0
    while True:
        snapshot = _parse_snapshot(ctx)
        if snapshot is None:
            break
        if snapshot["is_detailed"]:
            detailed_indices.append(index)
        if snapshot["is_peak"]:
            peak_index = index
        snapshots.append(snapshot["data"])
        index += 1

    mdata["snapshots"] = snapshots
    mdata["detailed_snapshot_indices"] = detailed_indices
    if peak_index is not None:
        mdata["peak_snapshot_index"] = peak_index
| 216 | + |
| 217 | + |
def _parse_snapshot(ctx):
    """
    Parse a single snapshot record and return a dict with the keys
    'is_detailed', 'is_peak' and 'data'. Return None on EOF.
    """
    snapshot_id = _get_next_field(ctx, _FIELD_SNAPSHOT_RE, may_reach_eof=True)
    if snapshot_id is None:
        return None

    # The field order below mirrors the massif output format.
    data = {
        "id": int(snapshot_id),
        "time": int(_get_next_field(ctx, _FIELD_TIME_RE)),
        "mem_heap": int(_get_next_field(ctx, _FIELD_MEM_HEAP_RE)),
        "mem_heap_extra": int(_get_next_field(ctx, _FIELD_MEM_EXTRA_RE)),
        "mem_stack": int(_get_next_field(ctx, _FIELD_MEM_STACK_RE)),
        "heap_tree": None,
    }

    # A non-empty heap_tree field means a detailed snapshot whose heap
    # tree follows; "peak" additionally marks it as the peak snapshot.
    heap_tree_kind = _get_next_field(ctx, _FIELD_HEAP_TREE_RE)
    is_detailed = heap_tree_kind != "empty"
    if is_detailed:
        data["heap_tree"] = _parse_heap_tree(ctx)

    return {
        "is_detailed": is_detailed,
        "is_peak": heap_tree_kind == "peak",
        "data": data,
    }
| 257 | + |
| 258 | + |
def _parse_heap_tree(ctx):
    """
    Recursively parse a heap tree rooted at the next entry read from ctx.

    Return a dict with the keys 'nbytes', 'children' (a list of child
    heap nodes) and 'details' (a dict with the keys 'address',
    'function', 'file' and 'line', or None for entries aggregated below
    the massif threshold).
    """
    line = _get_next_line(ctx)
    entry_match = _match_unconditional(ctx, _HEAP_ENTRY_RE, line)

    details = None
    details_match = _HEAP_DETAILS_RE.match(entry_match.group("details"))
    if details_match:
        # The 'line' group is None if the binary/library wasn't compiled
        # with debug info, so only convert it when present. Use the named
        # group rather than a positional index so the lookup survives
        # changes to the pattern's group layout.
        linum = details_match.group("line")
        details = {
            "address": details_match.group("address"),
            "function": details_match.group("function"),
            "file": details_match.group("fname"),
            "line": int(linum) if linum is not None else None,
        }

    # Each entry announces how many child entries follow it.
    num_children = int(entry_match.group("num_children"))
    children = [_parse_heap_tree(ctx) for _ in range(num_children)]

    return {
        "nbytes": int(entry_match.group("num_bytes")),
        "children": children,
        "details": details,
    }
0 commit comments