3 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
4 # Use of this source code is governed by a BSD-style license that can be
5 # found in the LICENSE file.
7 """Traces an executable and its child processes and extract the files accessed
10 The implementation uses OS-specific API. The native Kernel logger and the ETL
11 interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12 The OS-specific implementation is hidden in an 'API' interface.
14 The results are embedded in a Results instance. The tracing is done in two
15 phases, the first is to do the actual trace and generate an
16 implementation-specific log file. Then the log file is parsed to extract the
17 information, including the individual child processes and the files accessed
38 ## OS-specific imports
40 if sys.platform == 'win32':
41 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
42 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
43 from ctypes.wintypes import GetLastError # pylint: disable=E0611
44 elif sys.platform == 'darwin':
45 import Carbon.File # pylint: disable=F0401
46 import MacOS # pylint: disable=F0401
49 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
50 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
53 class TracingFailure(Exception):
54 """An exception occured during tracing."""
55 def __init__(self, description, pid, line_number, line, *args):
56 super(TracingFailure, self).__init__(
57 description, pid, line_number, line, *args)
58 self.description = description
60 self.line_number = line_number
65 out = self.description
67 out += '\npid: %d' % self.pid
69 out += '\nline: %d' % self.line_number
71 out += '\n%s' % self.line
73 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
77 ## OS-specific functions
79 if sys.platform == 'win32':
80 def QueryDosDevice(drive_letter):
81 """Returns the Windows 'native' path for a DOS drive letter."""
82 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
83 assert isinstance(drive_letter, unicode)
84 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
86 drive_letter = drive_letter
87 p = create_unicode_buffer(chars)
88 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
91 # pylint: disable=E0602
92 msg = u'QueryDosDevice(%s): %s (%d)' % (
93 drive_letter, FormatError(err), err)
94 raise WindowsError(err, msg.encode('utf-8'))
98 def GetShortPathName(long_path):
99 """Returns the Windows short path equivalent for a 'long' path."""
100 assert isinstance(long_path, unicode)
101 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
103 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
104 long_path = '\\\\?\\' + long_path
105 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
107 p = create_unicode_buffer(chars)
108 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
113 # pylint: disable=E0602
114 msg = u'GetShortPathName(%s): %s (%d)' % (
115 long_path, FormatError(err), err)
116 raise WindowsError(err, msg.encode('utf-8'))
119 def GetLongPathName(short_path):
120 """Returns the Windows long path equivalent for a 'short' path."""
121 assert isinstance(short_path, unicode)
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
125 short_path = '\\\\?\\' + short_path
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
128 p = create_unicode_buffer(chars)
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
134 # pylint: disable=E0602
135 msg = u'GetLongPathName(%s): %s (%d)' % (
136 short_path, FormatError(err), err)
137 raise WindowsError(err, msg.encode('utf-8'))
140 def get_current_encoding():
141 """Returns the 'ANSI' code page associated to the process."""
142 return 'cp%d' % int(windll.kernel32.GetACP())
145 class DosDriveMap(object):
146 """Maps \Device\HarddiskVolumeN to N: on Windows."""
147 # Keep one global cache.
151 """Lazy loads the cache."""
152 if not self._MAPPING:
153 # This is related to UNC resolver on windows. Ignore that.
154 self._MAPPING[u'\\Device\\Mup'] = None
155 self._MAPPING[u'\\SystemRoot'] = os.environ[u'SystemRoot']
157 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
159 letter = u'%s:' % letter
160 mapped = QueryDosDevice(letter)
161 if mapped in self._MAPPING:
163 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
164 '. Drive letters are a user-mode concept and the kernel '
165 'traces only have NT path, so all accesses will be '
166 'associated with the first drive letter, independent of the '
167 'actual letter used by the code') % (
168 self._MAPPING[mapped], letter))
170 self._MAPPING[mapped] = letter
171 except WindowsError: # pylint: disable=E0602
174 def to_win32(self, path):
175 """Converts a native NT path to Win32/DOS compatible path."""
176 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
179 'Can\'t convert %s into a Win32 compatible path' % path,
181 if not match.group(1) in self._MAPPING:
182 # Unmapped partitions may be accessed by windows for the
183 # fun of it while the test is running. Discard these.
185 drive = self._MAPPING[match.group(1)]
186 if not drive or not match.group(2):
188 return drive + match.group(2)
192 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
193 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
196 def get_native_path_case(p):
197 """Returns the native path case for an existing file.
199 On Windows, removes any leading '\\?\'.
203 'get_native_path_case(%r): Require an absolute path' % p, p)
208 # This means it has an alternate-data stream. There could be 3 ':', since
209 # it could be the $DATA datastream of an ADS. Split the whole ADS suffix
210 # off and add it back afterward. There is no way to know the native path
211 # case of an alternate data stream.
213 p = ':'.join(items[0:2])
214 suffix = ''.join(':' + i for i in items[2:])
216 # Windows used to have an option to turn on case sensitivity on non Win32
217 # subsystem but that's out of scope here and isn't supported anymore.
218 # Go figure why GetShortPathName() is needed.
220 out = GetLongPathName(GetShortPathName(p))
222 if e.args[0] in (2, 3, 5):
223 # The path does not exist. Try to recurse and reconstruct the path.
224 base = os.path.dirname(p)
225 rest = os.path.basename(p)
226 return os.path.join(get_native_path_case(base), rest)
228 if out.startswith('\\\\?\\'):
230 # Always upper case the first letter since GetLongPathName() will return the
231 # drive letter in the case it was given.
232 return out[0].upper() + out[1:] + suffix
235 def CommandLineToArgvW(command_line):
236 """Splits a commandline into argv using CommandLineToArgvW()."""
237 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
239 assert isinstance(command_line, unicode)
240 ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
242 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
244 windll.kernel32.LocalFree(ptr)
247 elif sys.platform == 'darwin':
250 # On non-windows, keep the stdlib behavior.
251 isabs = os.path.isabs
254 def _find_item_native_case(root_path, item):
255 """Gets the native path case of a single item based at root_path.
257 There is no API to get the native path case of symlinks on OSX. So it
258 needs to be done the slow way.
261 for element in os.listdir(root_path):
262 if element.lower() == item:
267 """Gets the native path case. Warning: this function resolves symlinks."""
269 rel_ref, _ = Carbon.File.FSPathMakeRef(p.encode('utf-8'))
270 # The OSX underlying code uses NFD but python strings are in NFC. This
271 # will cause issues with os.listdir() for example. Since the dtrace log
272 # *is* in NFC, normalize it here.
273 out = unicodedata.normalize(
274 'NFC', rel_ref.FSRefMakePath().decode('utf-8'))
275 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
276 return out + os.path.sep
278 except MacOS.Error, e:
279 if e.args[0] in (-43, -120):
280 # The path does not exist. Try to recurse and reconstruct the path.
281 # -43 means file not found.
282 # -120 means directory not found.
283 base = os.path.dirname(p)
284 rest = os.path.basename(p)
285 return os.path.join(_native_case(base), rest)
287 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
290 def _split_at_symlink_native(base_path, rest):
291 """Returns the native path for a symlink."""
292 base, symlink, rest = split_at_symlink(base_path, rest)
297 base_path = safe_join(base_path, base)
298 symlink = _find_item_native_case(base_path, symlink)
299 return base, symlink, rest
302 def get_native_path_case(path):
303 """Returns the native path case for an existing file.
305 Technically, it's only HFS+ on OSX that is case preserving and
306 insensitive. It's the default setting on HFS+ but can be changed.
308 assert isinstance(path, unicode), path
311 'Can\'t get native path case for a non-absolute path: %s' % path,
313 if path.startswith('/dev'):
314 # /dev is not visible from Carbon, causing an exception.
317 # Starts assuming there is no symlink along the path.
318 resolved = _native_case(path)
319 if resolved.lower() == path.lower():
320 # This code path is incredibly faster.
321 logging.debug('get_native_path_case(%s) = %s' % (path, resolved))
324 # There was a symlink, process it.
325 base, symlink, rest = _split_at_symlink_native(None, path)
326 assert symlink, (path, base, symlink, rest, resolved)
328 base = safe_join(_native_case(base), symlink)
329 assert len(base) > len(prev)
332 relbase, symlink, rest = _split_at_symlink_native(base, rest)
333 base = safe_join(base, relbase)
334 assert len(base) > len(prev), (prev, base, symlink)
336 base = safe_join(base, symlink)
337 assert len(base) > len(prev), (prev, base, symlink)
338 # Make sure no symlink was resolved.
339 assert base.lower() == path.lower(), (base, path)
340 logging.debug('get_native_path_case(%s) = %s' % (path, base))
344 else: # OSes other than Windows and OSX.
347 # On non-windows, keep the stdlib behavior.
348 isabs = os.path.isabs
351 def get_native_path_case(path):
352 """Returns the native path case for an existing file.
354 On OSes other than OSX and Windows, assume the file system is
357 TODO(maruel): This is not strictly true. Implement if necessary.
361 'Can\'t get native path case for a non-absolute path: %s' % path,
363 # Give up on cygwin, as GetLongPathName() can't be called.
364 # Linux traces tends to not be normalized so use this occasion to normalize
365 # it. This function implementation already normalizes the path on the other
366 # OS so this needs to be done here to be coherent between OSes.
367 out = os.path.normpath(path)
368 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
369 return out + os.path.sep
373 if sys.platform != 'win32': # All non-Windows OSes.
376 def safe_join(*args):
377 """Joins path elements like os.path.join() but doesn't abort on absolute
380 os.path.join('foo', '/bar') == '/bar'
381 but safe_join('foo', '/bar') == 'foo/bar'.
385 if element.startswith(os.path.sep):
386 if out.endswith(os.path.sep):
391 if out.endswith(os.path.sep):
394 out += os.path.sep + element
398 def split_at_symlink(base_dir, relfile):
399 """Scans each component of relfile and cut the string at the symlink if
402 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
403 not symlink was found.
407 assert os.path.isabs(base_dir)
410 assert os.path.isabs(relfile)
415 return safe_join(base_dir, rest)
420 index = relfile.index(os.path.sep, index)
423 full = at_root(relfile[:index])
424 if os.path.islink(full):
426 base = os.path.dirname(relfile[:index])
427 symlink = os.path.basename(relfile[:index])
428 rest = relfile[index:]
430 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
431 (base_dir, relfile, base, symlink, rest))
432 return base, symlink, rest
433 if index == len(relfile):
436 return relfile, None, None
439 def fix_python_path(cmd):
440 """Returns the fixed command line to call the right python executable."""
442 if out[0] == 'python':
443 out[0] = sys.executable
444 elif out[0].endswith('.py'):
445 out.insert(0, sys.executable)
450 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
454 'import subprocess\n'
456 'sys.exit(subprocess.call(sys.argv[2:]))\n'
462 def strace_process_quoted_arguments(text):
463 """Extracts quoted arguments on a string and return the arguments as a list.
465 Implemented as an automaton. Supports incomplete strings in the form
469 With text = '"foo", "bar"', the function will return ['foo', 'bar']
471 TODO(maruel): Implement escaping.
473 # All the possible states of the DFA.
474 ( NEED_QUOTE, # Begining of a new arguments.
475 INSIDE_STRING, # Inside an argument.
476 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
477 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
478 # a serie of 3 dots or a comma.
479 NEED_SPACE, # Right after a comma
480 NEED_DOT_2, # Found a dot, need a second one.
481 NEED_DOT_3, # Found second dot, need a third one.
482 NEED_COMMA, # Found third dot, need a comma.
487 for index, char in enumerate(text):
489 if state == NEED_QUOTE:
490 state = INSIDE_STRING
491 # A new argument was found.
493 elif state == INSIDE_STRING:
494 # The argument is now closed.
495 state = NEED_COMMA_OR_DOT
496 elif state == ESCAPED:
498 state = INSIDE_STRING
501 'Can\'t process char at column %d for: %r' % (index, text),
505 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
507 elif state == INSIDE_STRING:
509 elif state == ESCAPED:
511 state = INSIDE_STRING
514 'Can\'t process char at column %d for: %r' % (index, text),
518 if state == NEED_SPACE:
520 elif state == INSIDE_STRING:
522 elif state == ESCAPED:
524 state = INSIDE_STRING
527 'Can\'t process char at column %d for: %r' % (index, text),
531 if state == NEED_COMMA_OR_DOT:
532 # The string is incomplete, this mean the strace -s flag should be
535 elif state == NEED_DOT_2:
537 elif state == NEED_DOT_3:
539 elif state == INSIDE_STRING:
541 elif state == ESCAPED:
543 state = INSIDE_STRING
546 'Can\'t process char at column %d for: %r' % (index, text),
552 state = INSIDE_STRING
553 elif state == INSIDE_STRING:
557 'Can\'t process char at column %d for: %r' % (index, text),
561 if state == INSIDE_STRING:
565 'Can\'t process char at column %d for: %r' % (index, text),
568 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
570 'String is incorrectly terminated: %r' % text,
575 def read_json(filepath):
576 with open(filepath, 'r') as f:
580 def write_json(filepath_or_handle, data, dense):
581 """Writes data into filepath or file handle encoded as json.
583 If dense is True, the json is packed. Otherwise, it is human readable.
585 if hasattr(filepath_or_handle, 'write'):
587 filepath_or_handle.write(
588 json.dumps(data, sort_keys=True, separators=(',',':')))
590 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
592 with open(filepath_or_handle, 'wb') as f:
594 json.dump(data, f, sort_keys=True, separators=(',',':'))
596 json.dump(data, f, sort_keys=True, indent=2)
599 def assert_is_renderable(pseudo_string):
600 """Asserts the input is a valid object to be processed by render()."""
602 isinstance(pseudo_string, (None.__class__, unicode)) or
603 hasattr(pseudo_string, 'render')), repr(pseudo_string)
606 def render(pseudo_string):
607 """Converts the pseudo-string to an unicode string."""
608 assert_is_renderable(pseudo_string)
609 if isinstance(pseudo_string, (None.__class__, unicode)):
611 return pseudo_string.render()
614 class Results(object):
615 """Results of a trace session."""
617 class _TouchedObject(object):
618 """Something, a file or a directory, that was accessed."""
619 def __init__(self, root, path, tainted, size, nb_files):
621 '%s(%s, %s, %s, %s, %s)' %
622 (self.__class__.__name__, root, path, tainted, size, nb_files))
623 assert_is_renderable(root)
624 assert_is_renderable(path)
627 self.tainted = tainted
628 self.nb_files = nb_files
629 # Can be used as a cache or a default value, depending on context.
631 # These are cache only.
632 self._real_path = None
634 # Check internal consistency.
636 assert tainted or bool(root) != bool(isabs(path)), (root, path)
638 not os.path.exists(self.full_path) or
639 (self.full_path == get_native_path_case(self.full_path))), (
640 tainted, self.full_path, get_native_path_case(self.full_path))
644 return self.size != -1
649 return os.path.join(self.root, self.path)
654 """Returns the path with symlinks resolved."""
655 if not self._real_path:
656 self._real_path = os.path.realpath(self.full_path)
657 return self._real_path
661 """File's size. -1 is not existent."""
662 if self._size is None and not self.tainted:
664 self._size = os.stat(self.full_path).st_size
670 """Returns a dict representing this object.
672 A 'size' of 0 means the file was only touched and not read.
679 def replace_variables(self, variables):
680 """Replaces the root of this File with one of the variables if it matches.
682 If a variable replacement occurs, the cloned object becomes tainted.
684 for variable, root_path in variables.iteritems():
685 if self.path.startswith(root_path):
687 self.root, variable + self.path[len(root_path):], True)
688 # No need to clone, returns ourself.
691 def strip_root(self, root):
692 """Returns a clone of itself with 'root' stripped off.
694 Note that the file is kept if it is either accessible from a symlinked
695 path that was used to access the file or through the real path.
697 # Check internal consistency.
698 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
699 if not self.full_path.startswith(root):
700 # Now try to resolve the symlinks to see if it can be reached this way.
701 # Only try *after* trying without resolving symlink.
702 if not self.real_path.startswith(root):
704 path = self.real_path
706 path = self.full_path
707 return self._clone(root, path[len(root):], self.tainted)
709 def _clone(self, new_root, new_path, tainted):
710 raise NotImplementedError(self.__class__.__name__)
712 class File(_TouchedObject):
713 """A file that was accessed. May not be present anymore.
715 If tainted is true, it means it is not a real path anymore as a variable
718 If touched_only is True, this means the file was probed for existence, and
719 it is existent, but was never _opened_. If touched_only is True, the file
722 def __init__(self, root, path, tainted, size):
723 super(Results.File, self).__init__(root, path, tainted, size, 1)
725 def _clone(self, new_root, new_path, tainted):
726 """Clones itself keeping meta-data."""
727 # Keep the self.size and self._real_path caches for performance reason. It
728 # is also important when the file becomes tainted (with a variable instead
729 # of the real path) since self.path is not an on-disk path anymore so
730 # out._size cannot be updated.
731 out = self.__class__(new_root, new_path, tainted, self.size)
732 out._real_path = self._real_path
735 class Directory(_TouchedObject):
736 """A directory of files. Must exist."""
737 def __init__(self, root, path, tainted, size, nb_files):
738 """path='.' is a valid value and must be handled appropriately."""
739 assert not path.endswith(os.path.sep), path
740 super(Results.Directory, self).__init__(
741 root, path + os.path.sep, tainted, size, nb_files)
742 # For a Directory instance, self.size is not a cache, it's an actual value
743 # that is never modified and represents the total size of the files
744 # contained in this directory. It is possible that the directory is empty
745 # so that size == 0; this happens if there's only an invalid symlink in
749 out = super(Results.Directory, self).flatten()
750 out['nb_files'] = self.nb_files
753 def _clone(self, new_root, new_path, tainted):
754 """Clones itself keeping meta-data."""
755 out = self.__class__(
757 new_path.rstrip(os.path.sep),
761 out._real_path = self._real_path
764 class Process(object):
765 """A process that was traced.
767 Contains references to the files accessed by this process and its children.
769 def __init__(self, pid, files, executable, command, initial_cwd, children):
770 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
772 self.files = sorted(files, key=lambda x: x.path)
773 self.children = children
774 self.executable = executable
775 self.command = command
776 self.initial_cwd = initial_cwd
778 # Check internal consistency.
779 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
780 f.path for f in self.files)
781 assert isinstance(self.children, list)
782 assert isinstance(self.files, list)
786 for child in self.children:
793 'children': [c.flatten() for c in self.children],
794 'command': self.command,
795 'executable': self.executable,
796 'files': [f.flatten() for f in self.files],
797 'initial_cwd': self.initial_cwd,
801 def strip_root(self, root):
802 assert isabs(root) and root.endswith(os.path.sep), root
803 # Loads the files after since they are constructed as objects.
804 out = self.__class__(
806 filter(None, (f.strip_root(root) for f in self.files)),
810 [c.strip_root(root) for c in self.children])
812 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
815 def __init__(self, process):
816 self.process = process
822 'root': self.process.flatten(),
827 if self._files is None:
828 self._files = sorted(
829 sum((p.files for p in self.process.all), []),
830 key=lambda x: x.path)
835 return [f for f in self.files if f.existent]
838 def non_existent(self):
839 return [f for f in self.files if not f.existent]
841 def strip_root(self, root):
842 """Returns a clone with all the files outside the directory |root| removed
843 and converts all the path to be relative paths.
845 It keeps files accessible through the |root| directory or that have been
846 accessed through any symlink which points to the same directory.
848 # Resolve any symlink
849 root = os.path.realpath(root)
850 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
851 logging.debug('strip_root(%s)' % root)
852 return Results(self.process.strip_root(root))
855 class ApiBase(object):
856 """OS-agnostic API to trace a process and its children."""
857 class Context(object):
858 """Processes one log line at a time and keeps the list of traced processes.
860 The parsing is complicated by the fact that logs are traced out of order for
861 strace but in-order for dtrace and logman. In addition, on Windows it is
862 very frequent that processids are reused so a flat list cannot be used. But
863 at the same time, it is impossible to faithfully construct a graph when the
864 logs are processed out of order. So both a tree and a flat mapping are used,
865 the tree is the real process tree, while the flat mapping stores the last
866 valid process for the corresponding processid. For the strace case, the
867 tree's head is guessed at the last moment.
869 class Process(object):
870 """Keeps context for one traced child process.
872 Logs all the files this process touched. Ignores directories.
874 def __init__(self, blacklist, pid, initial_cwd):
875 # Check internal consistency.
876 assert isinstance(pid, int), repr(pid)
877 assert_is_renderable(initial_cwd)
879 # children are Process instances.
881 self.initial_cwd = initial_cwd
884 self.only_touched = set()
885 self.executable = None
887 self._blacklist = blacklist
889 def to_results_process(self):
890 """Resolves file case sensitivity and or late-bound strings."""
891 # When resolving files, it's normal to get dupe because a file could be
892 # opened multiple times with different case. Resolve the deduplication
895 """Returns the native file path case.
897 Converts late-bound strings.
900 # Do not convert None instance to 'None'.
904 # If the path is not absolute, which tends to happen occasionally on
905 # Windows, it is not possible to get the native path case so ignore
906 # that trace. It mostly happens for 'executable' value.
907 x = get_native_path_case(x)
910 def fix_and_blacklist_path(x):
914 # The blacklist needs to be reapplied, since path casing could
915 # influence blacklisting.
916 if self._blacklist(x):
920 # Filters out directories. Some may have passed through.
921 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
923 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
924 only_touched -= files
927 Results.File(None, f, False, None) for f in files
928 if not os.path.isdir(f)
930 # Using 0 as size means the file's content is ignored since the file was
931 # never opened for I/O.
933 Results.File(None, f, False, 0) for f in only_touched
934 if not os.path.isdir(f)
936 return Results.Process(
939 fix_path(self.executable),
941 fix_path(self.initial_cwd),
942 [c.to_results_process() for c in self.children])
944 def add_file(self, filepath, touch_only):
945 """Adds a file if it passes the blacklist."""
946 if self._blacklist(render(filepath)):
948 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
949 # Note that filepath and not render(filepath) is added. It is
950 # because filepath could be something else than a string, like a
951 # RelativePath instance for dtrace logs.
953 self.only_touched.add(filepath)
955 self.files.add(filepath)
957 def __init__(self, blacklist):
958 self.blacklist = blacklist
960 self.root_process = None
961 # dict to accelerate process lookup, to not have to lookup the whole graph
963 self._process_lookup = {}
965 class Tracer(object):
966 """During it's lifetime, the tracing subsystem is enabled."""
967 def __init__(self, logname):
968 self._logname = logname
969 self._lock = threading.Lock()
971 self._initialized = True
973 def trace(self, cmd, cwd, tracename, output):
974 """Runs the OS-specific trace program on an executable.
977 - cmd: The command (a list) to run.
978 - cwd: Current directory to start the child process in.
979 - tracename: Name of the trace in the logname file.
980 - output: If False, redirects output to PIPEs.
982 Returns a tuple (resultcode, output) and updates the internal trace
985 # The implementation adds an item to self._traces.
986 raise NotImplementedError(self.__class__.__name__)
988 def close(self, _timeout=None):
989 """Saves the meta-data in the logname file.
991 For kernel-based tracing, stops the tracing subsystem.
993 Must not be used manually when using 'with' construct.
996 assert self._initialized
999 'traces': self._traces,
1001 write_json(self._logname, data, False)
1003 self._initialized = False
1005 def post_process_log(self):
1006 """Post-processes the log so it becomes faster to load afterward.
1008 Must not be used manually when using 'with' construct.
1010 assert not self._initialized, 'Must stop tracing first.'
1012 def __enter__(self):
1013 """Enables 'with' statement."""
1016 def __exit__(self, exc_type, exc_value, traceback):
1017 """Enables 'with' statement."""
1019 # If an exception was thrown, do not process logs.
1021 self.post_process_log()
1023 def get_tracer(self, logname):
1024 """Returns an ApiBase.Tracer instance.
1026 Initializes the tracing subsystem, which is a requirement for kernel-based
1027 tracers. Only one tracer instance should be live at a time!
1029 logname is the filepath to the json file that will contain the meta-data
1032 return self.Tracer(logname)
1035 def clean_trace(logname):
1036 """Deletes an old log."""
1037 raise NotImplementedError()
1040 def parse_log(cls, logname, blacklist, trace_name):
1041 """Processes trace logs and returns the files opened and the files that do
1044 It does not track directories.
1047 - logname: must be an absolute path.
1048 - blacklist: must be a lambda.
1049 - trace_name: optional trace to read, defaults to reading all traces.
1051 Most of the time, files that do not exist are temporary test files that
1052 should be put in /tmp instead. See http://crbug.com/116251.
1054 Returns a list of dict with keys:
1055 - results: A Results instance.
1056 - trace: The corresponding tracename parameter provided to
1057 get_tracer().trace().
1058 - output: Output gathered during execution, if get_tracer().trace(...,
1059 output=False) was used.
1061 raise NotImplementedError(cls.__class__.__name__)
1064 class Strace(ApiBase):
1065 """strace implies linux."""
1067 def load_filename(filename):
1068 """Parses a filename in a log."""
1069 assert isinstance(filename, str)
1072 while i < len(filename):
1075 out += chr(int(filename[i+1:i+4], 8))
1080 # TODO(maruel): That's not necessarily true that the current code page is
1082 return out.decode('utf-8')
1084 class Context(ApiBase.Context):
1085 """Processes a strace log line and keeps the list of existent and non
1086 existent files accessed.
1088 Ignores directories.
1090 Uses late-binding to processes the cwd of each process. The problem is that
1091 strace generates one log file per process it traced but doesn't give any
1092 information about which process was started when and by who. So we don't
1093 even know which process is the initial one. So process the logs out of
1094 order and use late binding with RelativePath to be able to deduce the
1095 initial directory of each process once all the logs are parsed.
1097 class Process(ApiBase.Context.Process):
1098 """Represents the state of a process.
1100 Contains all the information retrieved from the pid-specific log.
1102 # Function names are using ([a-z_0-9]+)
1103 # This is the most common format. function(args) = result
1104 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$')
1105 # An interrupted function call, only grab the minimal header.
1106 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
1107 # An interrupted function call, with the process exiting. It must be the
1108 # last line in the log.
1109 RE_UNFINISHED_EXIT = re.compile(
1110 r'^([^\(]+)(.*) \<unfinished \.\.\.\ exit status \d+>$')
1111 # An interrupted function call the hard way. Usually observed with futex()
1113 RE_INTERRUPTED_HARD = re.compile(r'^([^\(]+)\([A-Z0-9a-fx\,\_\|\{\} ]*$')
1114 # A resumed function call.
1115 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1116 # A process received a signal.
1117 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1118 # A process didn't handle a signal. Ignore any junk appearing before,
1119 # because the process was forcibly killed so it won't open any new file.
1120 RE_KILLED = re.compile(
1121 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1122 # The process has exited.
1123 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1124 # A call was canceled. Ignore any prefix.
1125 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1126 # Happens when strace fails to even get the function name.
1127 UNNAMED_FUNCTION = '????'
1129 # Corner-case in python, a class member function decorator must not be
1131 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1132 """Automatically convert the str 'args' into a list of processed
1136 - regexp is used to parse args.
1137 - expect_zero: one of True, False or None.
1138 - True: will check for result.startswith('0') first and will ignore
1139 the trace line completely otherwise. This is important because for
1140 many functions, the regexp will not process if the call failed.
1141 - False: will check for not result.startswith(('?', '-1')) for the
1142 same reason than with True.
1143 - None: ignore result.
1145 def meta_hook(function):
1146 assert function.__name__.startswith('handle_')
1147 def hook(self, args, result):
1148 if expect_zero is True and not result.startswith('0'):
1150 if expect_zero is False and result.startswith(('?', '-1')):
1152 match = re.match(regexp, args)
1154 raise TracingFailure(
1155 'Failed to parse %s(%s) = %s' %
1156 (function.__name__[len('handle_'):], args, result),
1158 return function(self, match.groups(), result)
1162 class RelativePath(object):
1163 """A late-bound relative path."""
1164 def __init__(self, parent, value):
1165 assert_is_renderable(parent)
1166 self.parent = parent
1167 assert isinstance(value, (None.__class__, str)), repr(value)
1168 self.value = Strace.load_filename(value) if value else value
1170 assert '\\' not in self.value, value
1171 assert '\\' not in self.value, (repr(value), repr(self.value))
1174 """Returns the current directory this instance is representing.
1176 This function is used to return the late-bound value.
1178 if self.value and self.value.startswith(u'/'):
1180 # TODO(maruel): This is wrong, we can't assert it is utf-8.
1182 parent = self.parent.render() if self.parent else u'<None>'
1184 return os.path.normpath(os.path.join(parent, self.value))
1187 def __init__(self, root, pid):
1188 """Keeps enough information to be able to guess the original process
1191 strace doesn't store which process was the initial process. So more
1192 information needs to be kept so the graph can be reconstructed from the
1195 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1196 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1197 assert isinstance(root, ApiBase.Context)
1198 self._root = weakref.ref(root)
1199 # The dict key is the function name of the pending call, like 'open'
1201 self._pending_calls = {}
1202 self._line_number = 0
1203 # Current directory when the process started.
1204 self.initial_cwd = self.RelativePath(self._root(), None)
1205 self.parentid = None
1209 """Returns the best known value of cwd."""
1210 return self.cwd or self.initial_cwd
1213 """Returns the string value of the RelativePath() object.
1215 Used by RelativePath. Returns the initial directory and not the
1216 current one since the current directory 'cwd' validity is time-limited.
1218 The validity is only guaranteed once all the logs are processed.
1220 return self.initial_cwd.render()
1222 def on_line(self, line):
1223 assert isinstance(line, str)
1224 self._line_number += 1
1226 raise TracingFailure(
1227 'Found a trace for a terminated process',
1230 if self.RE_SIGNAL.match(line):
1235 match = self.RE_KILLED.match(line)
1237 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1238 self.handle_exit_group(match.group(1), None)
1241 match = self.RE_PROCESS_EXITED.match(line)
1243 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1244 self.handle_exit_group(match.group(1), None)
1247 match = self.RE_UNFINISHED.match(line)
1249 if match.group(1) in self._pending_calls:
1250 raise TracingFailure(
1251 'Found two unfinished calls for the same function',
1253 self._pending_calls)
1254 self._pending_calls[match.group(1)] = (
1255 match.group(1) + match.group(2))
1259 self.RE_UNFINISHED_EXIT.match(line) or
1260 self.RE_INTERRUPTED_HARD.match(line))
1262 # The process died. No other line can be processed afterward.
1266 match = self.RE_UNAVAILABLE.match(line)
1268 # This usually means a process was killed and a pending call was
1270 # TODO(maruel): Look up the last exit_group() trace just above and
1271 # make sure any self._pending_calls[anything] is properly flushed.
1274 match = self.RE_RESUMED.match(line)
1276 if match.group(1) not in self._pending_calls:
1277 raise TracingFailure(
1278 'Found a resumed call that was not logged as unfinished',
1280 self._pending_calls)
1281 pending = self._pending_calls.pop(match.group(1))
1282 # Reconstruct the line.
1283 line = pending + match.group(2)
1285 match = self.RE_HEADER.match(line)
1287 raise TracingFailure(
1288 'Found an invalid line: %s' % line,
1290 if match.group(1) == self.UNNAMED_FUNCTION:
1293 # It's a valid line, handle it.
1294 handler = getattr(self, 'handle_%s' % match.group(1), None)
1296 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1297 return handler(match.group(2), match.group(3))
1298 except TracingFailure, e:
1299 # Hack in the values since the handler could be a static function.
1302 e.line_number = self._line_number
1303 # Re-raise the modified exception.
1305 except (KeyError, NotImplementedError, ValueError), e:
1306 raise TracingFailure(
1307 'Trace generated a %s exception: %s' % (
1308 e.__class__.__name__, str(e)),
1314 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1315 def handle_access(self, args, _result):
1316 self._handle_file(args[0], True)
1318 @parse_args(r'^\"(.+?)\"$', True)
1319 def handle_chdir(self, args, _result):
1321 self.cwd = self.RelativePath(self, args[0])
1322 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1324 def handle_clone(self, _args, result):
1325 """Transfers cwd."""
1326 if result.startswith(('?', '-1')):
1329 # Update the other process right away.
1330 childpid = int(result)
1331 child = self._root().get_or_set_proc(childpid)
1332 if child.parentid is not None or childpid in self.children:
1333 raise TracingFailure(
1334 'Found internal inconsitency in process lifetime detection '
1335 'during a clone() call',
1338 # Copy the cwd object.
1339 child.initial_cwd = self.get_cwd()
1340 child.parentid = self.pid
1341 # It is necessary because the logs are processed out of order.
1342 self.children.append(child)
1344 def handle_close(self, _args, _result):
1347 def handle_chmod(self, _args, _result):
1350 def handle_creat(self, _args, _result):
1351 # Ignore files created, since they didn't need to exist.
1354 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1355 def handle_execve(self, args, _result):
1356 # Even if in practice execve() doesn't returns when it succeeds, strace
1357 # still prints '0' as the result.
1359 self._handle_file(filepath, False)
1360 self.executable = self.RelativePath(self.get_cwd(), filepath)
1361 self.command = strace_process_quoted_arguments(args[1])
1363 def handle_exit_group(self, _args, _result):
1367 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
1368 def handle_faccessat(self, args, _results):
1369 if args[0] == 'AT_FDCWD':
1370 self._handle_file(args[1], True)
1372 raise Exception('Relative faccess not implemented.')
1374 def handle_fork(self, args, result):
1375 self._handle_unknown('fork', args, result)
1377 def handle_futex(self, _args, _result):
1380 def handle_getcwd(self, _args, _result):
1383 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1384 def handle_link(self, args, _result):
1385 self._handle_file(args[0], False)
1386 self._handle_file(args[1], False)
1388 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1389 def handle_lstat(self, args, _result):
1390 self._handle_file(args[0], True)
1392 def handle_mkdir(self, _args, _result):
1395 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1396 def handle_open(self, args, _result):
1397 if 'O_DIRECTORY' in args[1]:
1399 self._handle_file(args[0], False)
1401 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1402 def handle_openat(self, args, _result):
1403 if 'O_DIRECTORY' in args[2]:
1405 if args[0] == 'AT_FDCWD':
1406 self._handle_file(args[1], False)
1408 # TODO(maruel): Implement relative open if necessary instead of the
1409 # AT_FDCWD flag, let's hope not since this means tracking all active
1410 # directory handles.
1411 raise Exception('Relative open via openat not implemented.')
1413 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1414 def handle_readlink(self, args, _result):
1415 self._handle_file(args[0], False)
1417 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1418 def handle_rename(self, args, _result):
1419 self._handle_file(args[0], False)
1420 self._handle_file(args[1], False)
1422 def handle_rmdir(self, _args, _result):
1425 def handle_setxattr(self, _args, _result):
1428 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1429 def handle_stat(self, args, _result):
1430 self._handle_file(args[0], True)
1432 def handle_symlink(self, _args, _result):
1435 @parse_args(r'^\"(.+?)\", \d+', True)
1436 def handle_truncate(self, args, _result):
1437 self._handle_file(args[0], False)
1439 def handle_unlink(self, _args, _result):
1440 # In theory, the file had to be created anyway.
1443 def handle_unlinkat(self, _args, _result):
1444 # In theory, the file had to be created anyway.
1447 def handle_statfs(self, _args, _result):
1450 def handle_utimensat(self, _args, _result):
1453 def handle_vfork(self, args, result):
1454 self._handle_unknown('vfork', args, result)
1457 def _handle_unknown(function, args, result):
1458 raise TracingFailure(
1459 'Unexpected/unimplemented trace %s(%s)= %s' %
1460 (function, args, result),
1463 def _handle_file(self, filepath, touch_only):
1464 filepath = self.RelativePath(self.get_cwd(), filepath)
1465 #assert not touch_only, render(filepath)
1466 self.add_file(filepath, touch_only)
1468 def __init__(self, blacklist, initial_cwd):
1469 super(Strace.Context, self).__init__(blacklist)
1470 assert_is_renderable(initial_cwd)
1471 self.initial_cwd = initial_cwd
1474 """Returns the string value of the initial cwd of the root process.
1476 Used by RelativePath.
1478 return self.initial_cwd
1480 def on_line(self, pid, line):
1481 """Transfers control into the Process.on_line() function."""
1482 self.get_or_set_proc(pid).on_line(line.strip())
1484 def to_results(self):
1485 """Finds back the root process and verify consistency."""
1486 # TODO(maruel): Absolutely unecessary, fix me.
1487 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1489 raise TracingFailure(
1490 'Found internal inconsitency in process lifetime detection '
1491 'while finding the root process',
1495 sorted(p.pid for p in root))
1496 self.root_process = root[0]
1497 process = self.root_process.to_results_process()
1498 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1499 raise TracingFailure(
1500 'Found internal inconsitency in process lifetime detection '
1501 'while looking for len(tree) == len(list)',
1505 sorted(self._process_lookup),
1506 sorted(p.pid for p in process.all))
1507 return Results(process)
1509 def get_or_set_proc(self, pid):
1510 """Returns the Context.Process instance for this pid or creates a new one.
1512 if not pid or not isinstance(pid, int):
1513 raise TracingFailure(
1514 'Unpexpected value for pid: %r' % pid,
1519 if pid not in self._process_lookup:
1520 self._process_lookup[pid] = self.Process(self, pid)
1521 return self._process_lookup[pid]
1525 """Returns the list of all handled traces to pass this as an argument to
1529 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1531 class Tracer(ApiBase.Tracer):
1534 def trace(self, cmd, cwd, tracename, output):
1535 """Runs strace on an executable."""
1536 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1537 assert os.path.isabs(cmd[0]), cmd[0]
1538 assert os.path.isabs(cwd), cwd
1539 assert os.path.normpath(cwd) == cwd, cwd
1541 if not self._initialized:
1542 raise TracingFailure(
1543 'Called Tracer.trace() on an unitialized object',
1544 None, None, None, tracename)
1545 assert tracename not in (i['trace'] for i in self._traces)
1546 stdout = stderr = None
1548 stdout = subprocess.PIPE
1549 stderr = subprocess.STDOUT
1550 # Ensure all file related APIs are hooked.
1551 traces = ','.join(Strace.Context.traces() + ['file'])
1555 '-s', '%d' % self.MAX_LEN,
1556 '-e', 'trace=%s' % traces,
1557 '-o', self._logname + '.' + tracename,
1559 child = subprocess.Popen(
1562 stdin=subprocess.PIPE,
1565 out = child.communicate()[0]
1566 # TODO(maruel): Walk the logs and figure out the root process would
1567 # simplify parsing the logs a *lot*.
1569 assert tracename not in (i['trace'] for i in self._traces)
1570 self._traces.append(
1575 # The pid of strace process, not very useful.
1579 return child.returncode, out
1582 def clean_trace(logname):
1583 if os.path.isfile(logname):
1585 # Also delete any pid specific file from previous traces.
1586 for i in glob.iglob(logname + '.*'):
1587 if i.rsplit('.', 1)[1].isdigit():
1591 def parse_log(cls, logname, blacklist, trace_name):
1592 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
1593 assert os.path.isabs(logname)
1594 data = read_json(logname)
1596 for item in data['traces']:
1597 if trace_name and item['trace'] != trace_name:
1600 'output': item['output'],
1601 'trace': item['trace'],
1604 context = cls.Context(blacklist, item['cwd'])
1605 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1606 pid = pidfile.rsplit('.', 1)[1]
1609 for line in open(pidfile, 'rb'):
1610 context.on_line(pid, line)
1611 result['results'] = context.to_results()
1612 except TracingFailure:
1613 result['exception'] = sys.exc_info()
1618 class Dtrace(ApiBase):
1619 """Uses DTrace framework through dtrace. Requires root access.
1623 dtruss can't be used because it has compatibility issues with python.
1625 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1626 get the absolute path of the 'cwd' dtrace variable from the probe.
1628 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1629 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1632 errno is not printed in the log since this implementation currently only cares
1633 about files that were successfully opened.
1635 class Context(ApiBase.Context):
1636 # Format: index pid function(args)
1637 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1639 # Arguments parsing.
1640 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1641 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1642 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1643 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1644 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1645 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1647 O_DIRECTORY = 0x100000
1649 class Process(ApiBase.Context.Process):
1650 def __init__(self, *args):
1651 super(Dtrace.Context.Process, self).__init__(*args)
1652 self.cwd = self.initial_cwd
1654 def __init__(self, blacklist, thunk_pid, initial_cwd):
1656 '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
1657 super(Dtrace.Context, self).__init__(blacklist)
1658 assert isinstance(initial_cwd, unicode), initial_cwd
1659 # Process ID of the temporary script created by create_thunk().
1660 self._thunk_pid = thunk_pid
1661 self._initial_cwd = initial_cwd
1662 self._line_number = 0
1664 def on_line(self, line):
1665 assert isinstance(line, unicode), line
1666 self._line_number += 1
1667 match = self.RE_HEADER.match(line)
1669 raise TracingFailure(
1670 'Found malformed line: %s' % line,
1676 'handle_%s' % match.group(2).replace('-', '_'),
1677 self._handle_ignored)
1678 # It is guaranteed to succeed because of the regexp. Or at least I thought
1680 pid = int(match.group(1))
1682 return fn(pid, match.group(3))
1683 except TracingFailure, e:
1684 # Hack in the values since the handler could be a static function.
1687 e.line_number = self._line_number
1688 # Re-raise the modified exception.
1690 except (KeyError, NotImplementedError, ValueError), e:
1691 raise TracingFailure(
1692 'Trace generated a %s exception: %s' % (
1693 e.__class__.__name__, str(e)),
1699 def to_results(self):
1700 process = self.root_process.to_results_process()
1701 # Internal concistency check.
1702 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1703 raise TracingFailure(
1704 'Found internal inconsitency in process lifetime detection '
1705 'while looking for len(tree) == len(list)',
1709 sorted(self._process_lookup),
1710 sorted(p.pid for p in process.all))
1711 return Results(process)
1713 def handle_dtrace_BEGIN(self, _pid, args):
1714 if not self.RE_DTRACE_BEGIN.match(args):
1715 raise TracingFailure(
1716 'Found internal inconsitency in dtrace_BEGIN log line',
1719 def handle_proc_start(self, pid, args):
1722 The dtrace script already takes care of only tracing the processes that
1723 are child of the traced processes so there is no need to verify the
1726 if pid in self._process_lookup:
1727 raise TracingFailure(
1728 'Found internal inconsitency in proc_start: %d started two times' %
1731 match = self.RE_PROC_START.match(args)
1733 raise TracingFailure(
1734 'Failed to parse arguments: %s' % args,
1736 ppid = int(match.group(1))
1737 if ppid == self._thunk_pid and not self.root_process:
1738 proc = self.root_process = self.Process(
1739 self.blacklist, pid, self._initial_cwd)
1740 elif ppid in self._process_lookup:
1741 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1742 self._process_lookup[ppid].children.append(proc)
1744 # Another process tree, ignore.
1746 self._process_lookup[pid] = proc
1748 'New child: %s -> %d cwd:%s' %
1749 (ppid, pid, render(proc.initial_cwd)))
1751 def handle_proc_exit(self, pid, _args):
1753 if pid in self._process_lookup:
1754 # self._thunk_pid is not traced itself and other traces run neither.
1755 self._process_lookup[pid].cwd = None
1757 def handle_execve(self, pid, args):
1758 """Sets the process' executable.
1760 TODO(maruel): Read command line arguments. See
1761 https://discussions.apple.com/thread/1980539 for an example.
1762 https://gist.github.com/1242279
1764 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1767 if not pid in self._process_lookup:
1768 # Another process tree, ignore.
1770 match = self.RE_EXECVE.match(args)
1772 raise TracingFailure(
1773 'Failed to parse arguments: %r' % args,
1775 proc = self._process_lookup[pid]
1776 proc.executable = match.group(1)
1777 proc.command = self.process_escaped_arguments(match.group(3))
1778 if int(match.group(2)) != len(proc.command):
1779 raise TracingFailure(
1780 'Failed to parse execve() arguments: %s' % args,
1783 def handle_chdir(self, pid, args):
1785 if pid not in self._process_lookup:
1786 # Another process tree, ignore.
1788 cwd = self.RE_CHDIR.match(args).group(1)
1789 if not cwd.startswith('/'):
1790 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1791 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1793 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1795 self._process_lookup[pid].cwd = cwd2
1797 def handle_open_nocancel(self, pid, args):
1798 """Redirects to handle_open()."""
1799 return self.handle_open(pid, args)
1801 def handle_open(self, pid, args):
1802 if pid not in self._process_lookup:
1803 # Another process tree, ignore.
1805 match = self.RE_OPEN.match(args)
1807 raise TracingFailure(
1808 'Failed to parse arguments: %s' % args,
1810 flag = int(match.group(2), 16)
1811 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1812 # Ignore directories.
1814 self._handle_file(pid, match.group(1))
1816 def handle_rename(self, pid, args):
1817 if pid not in self._process_lookup:
1818 # Another process tree, ignore.
1820 match = self.RE_RENAME.match(args)
1822 raise TracingFailure(
1823 'Failed to parse arguments: %s' % args,
1825 self._handle_file(pid, match.group(1))
1826 self._handle_file(pid, match.group(2))
1828 def _handle_file(self, pid, filepath):
1829 if not filepath.startswith('/'):
1830 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1831 # We can get '..' in the path.
1832 filepath = os.path.normpath(filepath)
1833 # Sadly, still need to filter out directories here;
1834 # saw open_nocancel(".", 0, 0) = 0 lines.
1835 if os.path.isdir(filepath):
1837 self._process_lookup[pid].add_file(filepath, False)
1839 def handle_ftruncate(self, pid, args):
1840 """Just used as a signal to kill dtrace, ignoring."""
1844 def _handle_ignored(pid, args):
1845 """Is called for all the event traces that are not handled."""
1846 raise NotImplementedError('Please implement me')
1849 def process_escaped_arguments(text):
1850 """Extracts escaped arguments on a string and return the arguments as a
1853 Implemented as an automaton.
1856 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
1857 function will return ['python2.7', '-c', 'print("hi")]
1859 if not text.endswith('\\0'):
1860 raise ValueError('String is not null terminated: %r' % text, text)
1864 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1868 if i == '\\' and not escaped:
1875 return [unescape(i) for i in text.split('\\001')]
1877 class Tracer(ApiBase.Tracer):
1878 # pylint: disable=C0301
1880 # To understand the following code, you'll want to take a look at:
1881 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1882 # https://wikis.oracle.com/display/DTrace/Variables
1883 # http://docs.oracle.com/cd/E19205-01/820-4221/
1885 # 0. Dump all the valid probes into a text file. It is important, you
1886 # want to redirect into a file and you don't want to constantly 'sudo'.
1887 # $ sudo dtrace -l > probes.txt
1889 # 1. Count the number of probes:
1890 # $ wc -l probes.txt
1891 # 81823 # On OSX 10.7, including 1 header line.
1893 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1894 # likes and skipping the header with NR>1:
1895 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1909 # 3. List of valid probes:
1910 # $ grep syscall probes.txt | less
1911 # or use dtrace directly:
1912 # $ sudo dtrace -l -P syscall | less
1914 # trackedpid is an associative array where its value can be 0, 1 or 2.
1915 # 0 is for untracked processes and is the default value for items not
1916 # in the associative array.
1917 # 1 is for tracked processes.
1918 # 2 is for the script created by create_thunk() only. It is not tracked
1919 # itself but all its decendants are.
1921 # The script will kill itself only once waiting_to_die == 1 and
1922 # current_processes == 0, so that both getlogin() was called and that
1923 # all traced processes exited.
1925 # TODO(maruel): Use cacheable predicates. See
1926 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1930 current_processes = 0;
1932 printf("%d %d %s_%s(\\"%s\\")\\n",
1933 logindex, PID, probeprov, probename, SCRIPT);
1937 proc:::start /trackedpid[ppid]/ {
1938 trackedpid[pid] = 1;
1939 current_processes += 1;
1940 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1941 logindex, pid, probeprov, probename,
1947 /* Should use SCRIPT but there is no access to this variable at that
1949 proc:::start /ppid == PID && execname == "Python"/ {
1950 trackedpid[pid] = 2;
1951 current_processes += 1;
1952 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1953 logindex, pid, probeprov, probename,
1959 proc:::exit /trackedpid[pid] &&
1960 current_processes == 1 &&
1961 waiting_to_die == 1/ {
1962 trackedpid[pid] = 0;
1963 current_processes -= 1;
1964 printf("%d %d %s_%s(%d)\\n",
1965 logindex, pid, probeprov, probename,
1970 proc:::exit /trackedpid[pid]/ {
1971 trackedpid[pid] = 0;
1972 current_processes -= 1;
1973 printf("%d %d %s_%s(%d)\\n",
1974 logindex, pid, probeprov, probename,
1979 /* Use an arcane function to detect when we need to die */
1980 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1982 printf("%d %d %s()\\n", logindex, pid, probefunc);
1985 syscall::ftruncate:entry /
1986 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1990 syscall::open*:entry /trackedpid[pid] == 1/ {
1991 self->open_arg0 = arg0;
1992 self->open_arg1 = arg1;
1993 self->open_arg2 = arg2;
1995 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1996 this->open_arg0 = copyinstr(self->open_arg0);
1997 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1998 logindex, pid, probefunc,
2003 this->open_arg0 = 0;
2005 syscall::open*:return /trackedpid[pid] == 1/ {
2006 self->open_arg0 = 0;
2007 self->open_arg1 = 0;
2008 self->open_arg2 = 0;
2011 syscall::rename:entry /trackedpid[pid] == 1/ {
2012 self->rename_arg0 = arg0;
2013 self->rename_arg1 = arg1;
2015 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
2016 this->rename_arg0 = copyinstr(self->rename_arg0);
2017 this->rename_arg1 = copyinstr(self->rename_arg1);
2018 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
2019 logindex, pid, probefunc,
2023 this->rename_arg0 = 0;
2024 this->rename_arg1 = 0;
2026 syscall::rename:return /trackedpid[pid] == 1/ {
2027 self->rename_arg0 = 0;
2028 self->rename_arg1 = 0;
2031 /* Track chdir, it's painful because it is only receiving relative path.
2033 syscall::chdir:entry /trackedpid[pid] == 1/ {
2034 self->chdir_arg0 = arg0;
2036 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
2037 this->chdir_arg0 = copyinstr(self->chdir_arg0);
2038 printf("%d %d %s(\\"%s\\")\\n",
2039 logindex, pid, probefunc,
2042 this->chdir_arg0 = 0;
2044 syscall::chdir:return /trackedpid[pid] == 1/ {
2045 self->chdir_arg0 = 0;
2049 # execve-specific code, tends to throw a lot of exceptions.
2051 /* Finally what we care about! */
2052 syscall::exec*:entry /trackedpid[pid]/ {
2053 self->exec_arg0 = copyinstr(arg0);
2054 /* Incrementally probe for a NULL in the argv parameter of execve() to
2055 * figure out argc. */
2056 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
2058 self->exec_argc = 0;
2059 /* Probe for argc==1 */
2060 this->exec_argv = (user_addr_t*)copyin(
2061 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2062 self->exec_argc = this->exec_argv[self->exec_argc] ?
2063 (self->exec_argc + 1) : self->exec_argc;
2065 /* Probe for argc==2 */
2066 this->exec_argv = (user_addr_t*)copyin(
2067 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2068 self->exec_argc = this->exec_argv[self->exec_argc] ?
2069 (self->exec_argc + 1) : self->exec_argc;
2071 /* Probe for argc==3 */
2072 this->exec_argv = (user_addr_t*)copyin(
2073 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2074 self->exec_argc = this->exec_argv[self->exec_argc] ?
2075 (self->exec_argc + 1) : self->exec_argc;
2077 /* Probe for argc==4 */
2078 this->exec_argv = (user_addr_t*)copyin(
2079 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2080 self->exec_argc = this->exec_argv[self->exec_argc] ?
2081 (self->exec_argc + 1) : self->exec_argc;
2083 /* Copy the inputs strings since there is no guarantee they'll be
2084 * present after the call completed. */
2085 self->exec_argv0 = (self->exec_argc > 0) ?
2086 copyinstr(this->exec_argv[0]) : "";
2087 self->exec_argv1 = (self->exec_argc > 1) ?
2088 copyinstr(this->exec_argv[1]) : "";
2089 self->exec_argv2 = (self->exec_argc > 2) ?
2090 copyinstr(this->exec_argv[2]) : "";
2091 self->exec_argv3 = (self->exec_argc > 3) ?
2092 copyinstr(this->exec_argv[3]) : "";
2093 this->exec_argv = 0;
2095 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2096 /* We need to join strings here, as using multiple printf() would
2097 * cause tearing when multiple threads/processes are traced.
2098 * Since it is impossible to escape a string and join it to another one,
2099 * like sprintf("%s%S", previous, more), use hackery.
2100 * Each of the elements are split with a \\1. \\0 cannot be used because
2101 * it is simply ignored. This will conflict with any program putting a
2102 * \\1 in their execve() string but this should be "rare enough" */
2104 /* Process exec_argv[0] */
2105 this->args = strjoin(
2106 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2108 /* Process exec_argv[1] */
2109 this->args = strjoin(
2110 this->args, (self->exec_argc > 1) ? "\\1" : "");
2111 this->args = strjoin(
2112 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2114 /* Process exec_argv[2] */
2115 this->args = strjoin(
2116 this->args, (self->exec_argc > 2) ? "\\1" : "");
2117 this->args = strjoin(
2118 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2120 /* Process exec_argv[3] */
2121 this->args = strjoin(
2122 this->args, (self->exec_argc > 3) ? "\\1" : "");
2123 this->args = strjoin(
2124 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2126 /* Prints self->exec_argc to permits verifying the internal
2127 * consistency since this code is quite fishy. */
2128 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2129 logindex, pid, probefunc,
2136 syscall::exec*:return /trackedpid[pid]/ {
2137 self->exec_arg0 = 0;
2138 self->exec_argc = 0;
2139 self->exec_argv0 = 0;
2140 self->exec_argv1 = 0;
2141 self->exec_argv2 = 0;
2142 self->exec_argv3 = 0;
2146 # Code currently not used.
2148 /* This is a good learning experience, since it traces a lot of things
2149 * related to the process and child processes.
2150 * Warning: it generates a gigantic log. For example, tracing
2151 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2152 * several minutes to execute.
2155 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2156 printf("%d %d %s_%s() = %d\\n",
2157 logindex, pid, probeprov, probefunc, errno);
2160 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2161 printf("%d %d %s_%s() = %d\\n",
2162 logindex, pid, probeprov, probefunc, errno);
2165 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2166 printf("%d %d %s_%s() = %d\\n",
2167 logindex, pid, probeprov, probefunc, errno);
2170 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2171 printf("%d %d %s_%s() = %d\\n",
2172 logindex, pid, probeprov, probefunc, errno);
2175 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2176 printf("%d %d %s_%s() = %d\\n",
2177 logindex, pid, probeprov, probefunc, errno);
2181 /* TODO(maruel): *stat* functions and friends
2182 syscall::access:return,
2183 syscall::chdir:return,
2184 syscall::chflags:return,
2185 syscall::chown:return,
2186 syscall::chroot:return,
2187 syscall::getattrlist:return,
2188 syscall::getxattr:return,
2189 syscall::lchown:return,
2190 syscall::lstat64:return,
2191 syscall::lstat:return,
2192 syscall::mkdir:return,
2193 syscall::pathconf:return,
2194 syscall::readlink:return,
2195 syscall::removexattr:return,
2196 syscall::setxattr:return,
2197 syscall::stat64:return,
2198 syscall::stat:return,
2199 syscall::truncate:return,
2200 syscall::unlink:return,
2201 syscall::utimes:return,
2205 def __init__(self, logname):
2206 """Starts the log collection with dtrace.
2208 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2209 this needs to wait for dtrace to be "warmed up".
2211 super(Dtrace.Tracer, self).__init__(logname)
2212 self._script = create_thunk()
2213 # This unique dummy temp file is used to signal the dtrace script that it
2214 # should stop as soon as all the child processes are done. A bit hackish
2215 # but works fine enough.
2216 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2217 prefix='trace_signal_file')
2219 # Note: do not use the -p flag. It's useless if the initial process quits
2220 # too fast, resulting in missing traces from the grand-children. The D
2221 # code manages the dtrace lifetime itself.
2225 # Use a larger buffer if getting 'out of scratch space' errors.
2226 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2228 '-x', 'dynvarsize=10m',
2229 #'-x', 'dtrace_global_maxsize=1m',
2230 '-x', 'evaltime=exec',
2231 '-o', '/dev/stderr',
2233 '-n', self._get_dtrace_code(),
2235 with open(self._logname + '.log', 'wb') as logfile:
2236 self._dtrace = subprocess.Popen(
2237 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2238 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2240 # Reads until one line is printed, which signifies dtrace is up and ready.
2241 with open(self._logname + '.log', 'rb') as logfile:
2242 while 'dtrace_BEGIN' not in logfile.readline():
2243 if self._dtrace.poll() is not None:
2244 # Do a busy wait. :/
2246 logging.debug('dtrace started')
2248 def _get_dtrace_code(self):
2249 """Setups the D code to implement child process tracking.
2251 Injects the cookie in the script so it knows when to stop.
2253 The script will detect any instance of the script created with
2254 create_thunk() and will start tracing it.
2257 'inline int PID = %d;\n'
2258 'inline string SCRIPT = "%s";\n'
2259 'inline int FILE_ID = %d;\n'
2264 self._dummy_file_id,
2266 if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1':
2267 # Do not enable by default since it tends to spew dtrace: error lines
2268 # because the execve() parameters are not in valid memory at the time of
2270 # TODO(maruel): Find a way to make this reliable since it's useful but
2271 # only works in limited/trivial uses cases for now.
2272 out += self.D_CODE_EXECVE
2275 def trace(self, cmd, cwd, tracename, output):
2276 """Runs dtrace on an executable.
2278 This dtruss is broken when it starts the process itself or when tracing
2279 child processes, this code starts a wrapper process
2280 generated with create_thunk() which starts the executable to trace.
2282 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2283 assert os.path.isabs(cmd[0]), cmd[0]
2284 assert os.path.isabs(cwd), cwd
2285 assert os.path.normpath(cwd) == cwd, cwd
2287 if not self._initialized:
2288 raise TracingFailure(
2289 'Called Tracer.trace() on an unitialized object',
2290 None, None, None, tracename)
2291 assert tracename not in (i['trace'] for i in self._traces)
2293 # Starts the script wrapper to start the child process. This signals the
2294 # dtrace script that this process is to be traced.
2295 stdout = stderr = None
2297 stdout = subprocess.PIPE
2298 stderr = subprocess.STDOUT
2304 # Call a dummy function so that dtrace knows I'm about to launch a process
2305 # that needs to be traced.
2307 child = subprocess.Popen(
2308 child_cmd + fix_python_path(cmd),
2309 stdin=subprocess.PIPE,
2313 logging.debug('Started child pid: %d' % child.pid)
2315 out = child.communicate()[0]
2316 # This doesn't mean tracing is done, one of the grand-child process may
2317 # still be alive. It will be tracked with the dtrace script.
2320 assert tracename not in (i['trace'] for i in self._traces)
2321 self._traces.append(
2325 # The pid of strace process, not very useful.
2330 return child.returncode, out
2332 def close(self, timeout=None):
2333 """Terminates dtrace."""
2334 logging.debug('close(%s)' % timeout)
2337 super(Dtrace.Tracer, self).close(timeout)
2338 # Signal dtrace that it should stop now.
2339 # ftruncate doesn't exist on Windows.
2340 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
2344 while (self._dtrace.poll() is None and
2345 (time.time() - start) < timeout):
2350 # Make sure to kill it in any case.
2351 if self._dtrace.poll() is None:
2358 if self._dtrace.returncode != 0:
2359 # Warn about any dtrace failure but basically ignore it.
2360 print 'dtrace failure: %s' % self._dtrace.returncode
2362 os.close(self._dummy_file_id)
2363 os.remove(self._dummy_file_name)
2364 os.remove(self._script)
2366 def post_process_log(self):
2367 """Sorts the log back in order when each call occured.
2369 dtrace doesn't save the buffer in strict order since it keeps one buffer
2372 super(Dtrace.Tracer, self).post_process_log()
2373 logname = self._logname + '.log'
2374 with open(logname, 'rb') as logfile:
2375 lines = [l for l in logfile if l.strip()]
2376 errors = [l for l in lines if l.startswith('dtrace:')]
2378 raise TracingFailure(
2379 'Found errors in the trace: %s' % '\n'.join(errors),
2380 None, None, None, logname)
2382 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2384 raise TracingFailure(
2385 'Found errors in the trace: %s' % '\n'.join(
2386 l for l in lines if l.split(' ', 1)[0].isdigit()),
2387 None, None, None, logname)
2388 with open(logname, 'wb') as logfile:
2389 logfile.write(''.join(lines))
2392 def clean_trace(logname):
2393 for ext in ('', '.log'):
2394 if os.path.isfile(logname + ext):
2395 os.remove(logname + ext)
2398 def parse_log(cls, logname, blacklist, trace_name):
2399 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
2400 assert os.path.isabs(logname)
2402 def blacklist_more(filepath):
2403 # All the HFS metadata is in the form /.vol/...
2404 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2406 data = read_json(logname)
2408 for item in data['traces']:
2409 if trace_name and item['trace'] != trace_name:
2412 'output': item['output'],
2413 'trace': item['trace'],
2416 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
2417 # It's fine to assume the file as UTF-8: OSX enforces the file names to
2418 # be valid UTF-8 and we control the log output.
2419 for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'):
2420 context.on_line(line)
2421 result['results'] = context.to_results()
2422 except TracingFailure:
2423 result['exception'] = sys.exc_info()
2428 class LogmanTrace(ApiBase):
2429 """Uses the native Windows ETW based tracing functionality to trace a child
2432 Caveat: this implementations doesn't track cwd or initial_cwd. It is because
2433 the Windows Kernel doesn't have a concept of 'current working directory' at
2434 all. A Win32 process has a map of current directories, one per drive letter
2435 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2436 opened relative to another file_object or as an absolute path. All the current
2437 working directory logic is done in user mode.
2439 class Context(ApiBase.Context):
2440 """Processes a ETW log line and keeps the list of existent and non
2441 existent files accessed.
2443 Ignores directories.
2445 # These indexes are for the stripped version in json.
2454 class Process(ApiBase.Context.Process):
2455 def __init__(self, *args):
2456 super(LogmanTrace.Context.Process, self).__init__(*args)
2457 # Handle file objects that succeeded.
2458 self.file_objects = {}
2460 def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
2462 '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
2464 super(LogmanTrace.Context, self).__init__(blacklist)
2465 self._drive_map = DosDriveMap()
2466 # Threads mapping to the corresponding process id.
2467 self._threads_active = {}
2468 # Process ID of the tracer, e.g. the temporary script created by
2469 # create_thunk(). This is tricky because the process id may have been
2471 self._thunk_pid = thunk_pid
2472 self._thunk_cmd = thunk_cmd
2473 self._trace_name = trace_name
2474 self._line_number = 0
2475 self._thunk_process = None
2477 def on_line(self, line):
2478 """Processes a json Event line."""
2479 self._line_number += 1
2484 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2487 raise TracingFailure(
2488 'Unexpected event %s_%s' % (
2489 line[self.EVENT_NAME], line[self.TYPE]),
2492 except TracingFailure, e:
2493 # Hack in the values since the handler could be a static function.
2494 e.pid = line[self.PID]
2496 e.line_number = self._line_number
2497 # Re-raise the modified exception.
2499 except (KeyError, NotImplementedError, ValueError), e:
2500 raise TracingFailure(
2501 'Trace generated a %s exception: %s' % (
2502 e.__class__.__name__, str(e)),
2508 def to_results(self):
2509 if not self.root_process:
2510 raise TracingFailure(
2511 'Failed to detect the initial process %d' % self._thunk_pid,
2513 process = self.root_process.to_results_process()
2514 return Results(process)
2516 def _thread_to_process(self, tid):
2517 """Finds the process from the thread id."""
2519 pid = self._threads_active.get(tid)
2520 if not pid or not self._process_lookup.get(pid):
2522 return self._process_lookup[pid]
2525 def handle_EventTrace_Header(cls, line):
2526 """Verifies no event was dropped, e.g. no buffer overrun occured."""
2527 BUFFER_SIZE = cls.USER_DATA
2528 #VERSION = cls.USER_DATA + 1
2529 #PROVIDER_VERSION = cls.USER_DATA + 2
2530 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2531 #END_TIME = cls.USER_DATA + 4
2532 #TIMER_RESOLUTION = cls.USER_DATA + 5
2533 #MAX_FILE_SIZE = cls.USER_DATA + 6
2534 #LOG_FILE_MODE = cls.USER_DATA + 7
2535 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2536 #START_BUFFERS = cls.USER_DATA + 9
2537 #POINTER_SIZE = cls.USER_DATA + 10
2538 EVENTS_LOST = cls.USER_DATA + 11
2539 #CPU_SPEED = cls.USER_DATA + 12
2540 #LOGGER_NAME = cls.USER_DATA + 13
2541 #LOG_FILE_NAME = cls.USER_DATA + 14
2542 #BOOT_TIME = cls.USER_DATA + 15
2543 #PERF_FREQ = cls.USER_DATA + 16
2544 #START_TIME = cls.USER_DATA + 17
2545 #RESERVED_FLAGS = cls.USER_DATA + 18
2546 #BUFFERS_LOST = cls.USER_DATA + 19
2547 #SESSION_NAME_STRING = cls.USER_DATA + 20
2548 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2549 if line[EVENTS_LOST] != '0':
2550 raise TracingFailure(
2551 ( '%s events were lost during trace, please increase the buffer '
2552 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2555 def handle_FileIo_Cleanup(self, line):
2556 """General wisdom: if a file is closed, it's because it was opened.
2558 Note that FileIo_Close is not used since if a file was opened properly but
2559 not closed before the process exits, only Cleanup will be logged.
2561 #IRP = self.USER_DATA
2562 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2563 FILE_OBJECT = self.USER_DATA + 2
2564 #FILE_KEY = self.USER_DATA + 3
2565 proc = self._thread_to_process(line[TTID])
2567 # Not a process we care about.
2569 file_object = line[FILE_OBJECT]
2570 if file_object in proc.file_objects:
2571 proc.add_file(proc.file_objects.pop(file_object), False)
2573 def handle_FileIo_Create(self, line):
2574 """Handles a file open.
2576 All FileIo events are described at
2577 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2578 for some value of 'description'.
2580 " (..) process and thread id values of the IO events (..) are not valid "
2581 http://msdn.microsoft.com/magazine/ee358703.aspx
2583 The FileIo.Create event doesn't return if the CreateFile() call
2584 succeeded, so keep track of the file_object and check that it is
2585 eventually closed with FileIo_Cleanup.
2587 #IRP = self.USER_DATA
2588 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2589 FILE_OBJECT = self.USER_DATA + 2
2590 #CREATE_OPTIONS = self.USER_DATA + 3
2591 #FILE_ATTRIBUTES = self.USER_DATA + 4
2592 #self.USER_DATA + SHARE_ACCESS = 5
2593 OPEN_PATH = self.USER_DATA + 6
2595 proc = self._thread_to_process(line[TTID])
2597 # Not a process we care about.
2600 raw_path = line[OPEN_PATH]
2601 # Ignore directories and bare drive right away.
2602 if raw_path.endswith(os.path.sep):
2604 filepath = self._drive_map.to_win32(raw_path)
2605 # Ignore bare drive right away. Some may still fall through with format
2607 if len(filepath) == 2:
2609 file_object = line[FILE_OBJECT]
2610 if os.path.isdir(filepath):
2611 # There is no O_DIRECTORY equivalent on Windows. The closed is
2612 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2613 # simply discard directories are they are found.
2615 # Override any stale file object
2616 proc.file_objects[file_object] = filepath
2618 def handle_FileIo_Rename(self, line):
2619 # TODO(maruel): Handle?
2622 def handle_Process_End(self, line):
2623 pid = line[self.PID]
2624 if self._process_lookup.get(pid):
2625 logging.info('Terminated: %d' % pid)
2626 self._process_lookup[pid] = None
2628 logging.debug('Terminated: %d' % pid)
2629 if self._thunk_process and self._thunk_process.pid == pid:
2630 self._thunk_process = None
2632 def handle_Process_Start(self, line):
2633 """Handles a new child process started by PID."""
2634 #UNIQUE_PROCESS_KEY = self.USER_DATA
2635 PROCESS_ID = self.USER_DATA + 1
2636 #PARENT_PID = self.USER_DATA + 2
2637 #SESSION_ID = self.USER_DATA + 3
2638 #EXIT_STATUS = self.USER_DATA + 4
2639 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2640 #USER_SID = self.USER_DATA + 6
2641 IMAGE_FILE_NAME = self.USER_DATA + 7
2642 COMMAND_LINE = self.USER_DATA + 8
2644 ppid = line[self.PID]
2645 pid = int(line[PROCESS_ID], 16)
2646 command_line = CommandLineToArgvW(line[COMMAND_LINE])
2648 'New process %d->%d (%s) %s' %
2649 (ppid, pid, line[IMAGE_FILE_NAME], command_line))
2651 if pid == self._thunk_pid:
2652 # Need to ignore processes we don't know about because the log is
2653 # system-wide. self._thunk_pid shall start only one process.
2654 # This is tricky though because Windows *loves* to reuse process id and
2655 # it happens often that the process ID of the thunk script created by
2656 # create_thunk() is reused. So just detecting the pid here is not
2657 # sufficient, we must confirm the command line.
2658 if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
2660 'Ignoring duplicate pid %d for %s: %s while searching for %s',
2661 pid, self._trace_name, command_line, self._thunk_cmd)
2664 # TODO(maruel): The check is quite weak. Add the thunk path.
2665 if self._thunk_process:
2666 raise TracingFailure(
2667 ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is '
2668 'already set') % (self._thunk_pid, self._thunk_process.pid),
2670 proc = self.Process(self.blacklist, pid, None)
2671 self._thunk_process = proc
2673 elif ppid == self._thunk_pid and self._thunk_process:
2674 proc = self.Process(self.blacklist, pid, None)
2675 self.root_process = proc
2677 elif self._process_lookup.get(ppid):
2678 proc = self.Process(self.blacklist, pid, None)
2679 self._process_lookup[ppid].children.append(proc)
2683 self._process_lookup[pid] = proc
2685 proc.command = command_line
2686 proc.executable = line[IMAGE_FILE_NAME]
2687 # proc.command[0] may be the absolute path of 'executable' but it may be
2688 # anything else too. If it happens that command[0] ends with executable,
2689 # use it, otherwise defaults to the base name.
2690 cmd0 = proc.command[0].lower()
2691 if not cmd0.endswith('.exe'):
2692 # TODO(maruel): That's not strictly true either.
2694 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2696 cmd0 = cmd0.replace('/', os.path.sep)
2697 cmd0 = os.path.normpath(cmd0)
2698 proc.executable = get_native_path_case(cmd0)
2700 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2702 def handle_Thread_End(self, line):
2703 """Has the same parameters as Thread_Start."""
2704 tid = int(line[self.TID], 16)
2705 self._threads_active.pop(tid, None)
2707 def handle_Thread_Start(self, line):
2708 """Handles a new thread created.
2710 Do not use self.PID here since a process' initial thread is created by
2713 PROCESS_ID = self.USER_DATA
2714 TTHREAD_ID = self.USER_DATA + 1
2715 #STACK_BASE = self.USER_DATA + 2
2716 #STACK_LIMIT = self.USER_DATA + 3
2717 #USER_STACK_BASE = self.USER_DATA + 4
2718 #USER_STACK_LIMIT = self.USER_DATA + 5
2719 #AFFINITY = self.USER_DATA + 6
2720 #WIN32_START_ADDR = self.USER_DATA + 7
2721 #TEB_BASE = self.USER_DATA + 8
2722 #SUB_PROCESS_TAG = self.USER_DATA + 9
2723 #BASE_PRIORITY = self.USER_DATA + 10
2724 #PAGE_PRIORITY = self.USER_DATA + 11
2725 #IO_PRIORITY = self.USER_DATA + 12
2726 #THREAD_FLAGS = self.USER_DATA + 13
2727 # Do not use self.PID here since a process' initial thread is created by
2728 # the parent process.
2729 pid = int(line[PROCESS_ID], 16)
2730 tid = int(line[TTHREAD_ID], 16)
2731 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2732 self._threads_active[tid] = pid
2735 def supported_events(cls):
2736 """Returns all the procesed events."""
2738 for member in dir(cls):
2739 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2741 out.append(match.groups())
2744 class Tracer(ApiBase.Tracer):
2745 # The basic headers.
2758 u'Processor Number',
2760 u'Parent Instance ID',
2762 u'Related Activity ID', # 15
2764 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2765 u'User(ms)', # pretty much useless.
2766 u'User Data', # Extra arguments that are event-specific.
2768 # Only the useful headers common to all entries are listed there. Any column
2769 # at 19 or higher is dependent on the specific event.
2776 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2779 class CsvReader(object):
2780 """CSV reader that reads files generated by tracerpt.exe.
2782 csv.reader() fails to read them properly, it mangles file names quoted
2783 with "" with a comma in it.
2785 # 0. Had a ',' or one of the following ' ' after a comma, next should
2786 # be ' ', '"' or string or ',' for an empty field.
2788 # 1. Processing an unquoted field up to ','.
2790 # 2. Processing a new field starting with '"'.
2791 STARTING_STR_QUOTED,
2792 # 3. Second quote in a row at the start of a field. It could be either
2793 # '""foo""' or '""'. Who the hell thought it was a great idea to use
2794 # the same character for delimiting and escaping?
2795 STARTING_SECOND_QUOTE,
2796 # 4. A quote inside a quoted string where the previous character was
2797 # not a quote, so the string is not empty. Can be either: end of a
2798 # quoted string (a delimiter) or a quote escape. The next char must be
2799 # either '"' or ','.
2800 HAD_QUOTE_IN_QUOTED,
2801 # 5. Second quote inside a quoted string.
2802 HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED,
2803 # 6. Processing a field that started with '"'.
2804 IN_STR_QUOTED) = range(7)
2806 def __init__(self, f):
2813 """Splits the line in fields."""
2814 line = self.f.readline()
2816 raise StopIteration()
2819 state = self.HAD_DELIMITER
2820 for i, c in enumerate(line):
2821 if state == self.HAD_DELIMITER:
2826 # Ignore initial whitespaces
2829 state = self.STARTING_STR_QUOTED
2832 # Start of a new field.
2836 elif state == self.IN_STR:
2837 # Do not accept quote inside unquoted field.
2838 assert c != '"', (i, c, line, fields)
2840 fields[-1] = fields[-1].strip()
2841 state = self.HAD_DELIMITER
2843 fields[-1] = fields[-1] + c
2845 elif state == self.STARTING_STR_QUOTED:
2847 # Do not store the character yet.
2848 state = self.STARTING_SECOND_QUOTE
2850 state = self.IN_STR_QUOTED
2851 fields[-1] = fields[-1] + c
2853 elif state == self.STARTING_SECOND_QUOTE:
2855 # It was an empty field. '""' == ''.
2856 state = self.HAD_DELIMITER
2858 fields[-1] = fields[-1] + '"' + c
2859 state = self.IN_STR_QUOTED
2861 elif state == self.HAD_QUOTE_IN_QUOTED:
2863 # End of the string.
2864 state = self.HAD_DELIMITER
2866 state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED
2868 # The previous double-quote was just an unescaped quote.
2869 fields[-1] = fields[-1] + '"' + c
2870 state = self.IN_STR_QUOTED
2872 elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
2874 # End of the string.
2875 state = self.HAD_DELIMITER
2876 fields[-1] = fields[-1] + '"'
2878 assert False, (i, c, line, fields)
2880 elif state == self.IN_STR_QUOTED:
2882 # Could be a delimiter or an escape.
2883 state = self.HAD_QUOTE_IN_QUOTED
2885 fields[-1] = fields[-1] + c
2887 if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
2888 fields[-1] = fields[-1] + '"'
2891 # Terminated with a normal field.
2893 # Terminated with an empty field.
2894 self.STARTING_SECOND_QUOTE,
2895 # Terminated with a normal quoted field.
2896 self.HAD_QUOTE_IN_QUOTED), (
2897 line, state, fields)
2900 def __init__(self, logname):
2901 """Starts the log collection.
2903 Requires administrative access. logman.exe is synchronous so no need for a
2904 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2905 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2907 One can get the list of potentially interesting providers with:
2908 "logman query providers | findstr /i file"
2910 super(LogmanTrace.Tracer, self).__init__(logname)
2911 self._script = create_thunk()
2916 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2917 # splitio,fileiocompletion,syscall,file,cswitch,img
2918 '(process,fileio,thread)',
2919 '-o', self._logname + '.etl',
2920 '-ets', # Send directly to kernel
2921 # Values extracted out of thin air.
2922 # Event Trace Session buffer size in kb.
2924 # Number of Event Trace Session buffers.
2927 logging.debug('Running: %s' % cmd_start)
2929 subprocess.check_call(
2931 stdin=subprocess.PIPE,
2932 stdout=subprocess.PIPE,
2933 stderr=subprocess.STDOUT)
2934 except subprocess.CalledProcessError, e:
2935 if e.returncode == -2147024891:
2936 print >> sys.stderr, 'Please restart with an elevated admin prompt'
2937 elif e.returncode == -2144337737:
2938 print >> sys.stderr, (
2939 'A kernel trace was already running, stop it and try again')
2942 def trace(self, cmd, cwd, tracename, output):
2943 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2944 assert os.path.isabs(cmd[0]), cmd[0]
2945 assert os.path.isabs(cwd), cwd
2946 assert os.path.normpath(cwd) == cwd, cwd
2948 if not self._initialized:
2949 raise TracingFailure(
2950 'Called Tracer.trace() on an unitialized object',
2951 None, None, None, tracename)
2952 assert tracename not in (i['trace'] for i in self._traces)
2954 # Use "logman -?" for help.
2956 stdout = stderr = None
2958 stdout = subprocess.PIPE
2959 stderr = subprocess.STDOUT
2961 # Run the child process.
2962 logging.debug('Running: %s' % cmd)
2963 # Use the temporary script generated with create_thunk() so we have a
2964 # clear pid owner. Since trace_inputs.py can be used as a library and
2965 # could trace multiple processes simultaneously, it makes it more complex
2966 # if the executable to be traced is executed directly here. It also solves
2967 # issues related to logman.exe that needs to be executed to control the
2974 child = subprocess.Popen(
2975 child_cmd + fix_python_path(cmd),
2977 stdin=subprocess.PIPE,
2980 logging.debug('Started child pid: %d' % child.pid)
2981 out = child.communicate()[0]
2982 # This doesn't mean all the grand-children are done. Sadly, we don't have
2983 # a good way to determine that.
2986 assert tracename not in (i['trace'] for i in self._traces)
2987 self._traces.append({
2992 # Used to figure out the real process when process ids are reused.
2993 'thunk_cmd': child_cmd,
2997 return child.returncode, out
2999 def close(self, _timeout=None):
3000 """Stops the kernel log collection and converts the traces to text
3004 if not self._initialized:
3005 raise TracingFailure(
3006 'Called Tracer.close() on an unitialized object',
3008 os.remove(self._script)
3009 # Save metadata, add 'format' key..
3012 'traces': self._traces,
3014 write_json(self._logname, data, False)
3020 '-ets', # Sends the command directly to the kernel.
3022 logging.debug('Running: %s' % cmd_stop)
3023 subprocess.check_call(
3025 stdin=subprocess.PIPE,
3026 stdout=subprocess.PIPE,
3027 stderr=subprocess.STDOUT)
3028 self._initialized = False
3030 def post_process_log(self):
3031 """Converts the .etl file into .csv then into .json."""
3032 super(LogmanTrace.Tracer, self).post_process_log()
3034 self._convert_log(logformat)
3035 self._trim_log(logformat)
3037 def _trim_log(self, logformat):
3038 """Reduces the amount of data in original log by generating a 'reduced'
3041 if logformat == 'csv_utf16':
3042 file_handle = codecs.open(
3043 self._logname + '.' + logformat, 'r', encoding='utf-16')
3045 elif logformat == 'csv':
3046 assert sys.getfilesystemencoding() == 'mbcs'
3047 file_handle = codecs.open(
3048 self._logname + '.' + logformat, 'r',
3049 encoding=get_current_encoding())
3051 supported_events = LogmanTrace.Context.supported_events()
3053 def trim(generator):
3054 """Loads items from the generator and returns the interesting data.
3056 It filters out any uninteresting line and reduce the amount of data in
3059 for index, line in enumerate(generator):
3061 if line != self.EXPECTED_HEADER:
3062 raise TracingFailure(
3063 'Found malformed header: %s' % line,
3066 # As you can see, the CSV is full of useful non-redundant information:
3067 if (line[2] != '0' or # Event ID
3068 line[3] not in ('2', '3') or # Version
3069 line[4] != '0' or # Channel
3070 line[5] != '0' or # Level
3071 line[7] != '0' or # Task
3072 line[8] != '0x0000000000000000' or # Keyword
3073 line[12] != '' or # Instance ID
3074 line[13] != '' or # Parent Instance ID
3075 line[14] != self.NULL_GUID or # Activity ID
3076 line[15] != ''): # Related Activity ID
3077 raise TracingFailure(
3078 'Found unexpected values in line: %s' % ' '.join(line),
3081 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
3085 line[self.EVENT_NAME],
3089 line[self.PROCESSOR_ID],
3090 line[self.TIMESTAMP],
3091 ] + line[self.USER_DATA:]
3093 # must not convert the trim() call into a list, since it will use too much
3094 # memory for large trace. use a csv file as a workaround since the json
3095 # parser requires a complete in-memory file.
3096 with open('%s.preprocessed' % self._logname, 'wb') as f:
3097 # $ and * can't be used in file name on windows, reducing the likelihood
3098 # of having to escape a string.
3100 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
3101 for line in trim(self.CsvReader(file_handle)):
3102 out.writerow([s.encode('utf-8') for s in line])
3104 def _convert_log(self, logformat):
3105 """Converts the ETL trace to text representation.
3107 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
3108 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
3111 - logformat: Text format to be generated, csv, csv_utf16 or xml.
3113 Use "tracerpt -?" for help.
3115 LOCALE_INVARIANT = 0x7F
3116 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
3119 '-l', self._logname + '.etl',
3120 '-o', self._logname + '.' + logformat,
3123 # Use -of XML to get the header of each items after column 19, e.g. all
3124 # the actual headers of 'User Data'.
3127 if logformat == 'csv':
3128 # tracerpt localizes the 'Type' column, for major brainfuck
3129 # entertainment. I can't imagine any sane reason to do that.
3130 cmd_convert.extend(['-of', 'CSV'])
3131 elif logformat == 'csv_utf16':
3132 # This causes it to use UTF-16, which doubles the log size but ensures
3133 # the log is readable for non-ASCII characters.
3134 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
3135 elif logformat == 'xml':
3136 cmd_convert.extend(['-of', 'XML'])
3138 raise ValueError('Unexpected log format \'%s\'' % logformat)
3139 logging.debug('Running: %s' % cmd_convert)
3140 # This can takes tens of minutes for large logs.
3141 # Redirects all output to stderr.
3142 subprocess.check_call(
3144 stdin=subprocess.PIPE,
3149 def clean_trace(logname):
3150 for ext in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'):
3151 if os.path.isfile(logname + ext):
3152 os.remove(logname + ext)
3155 def parse_log(cls, logname, blacklist, trace_name):
3156 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
3157 assert os.path.isabs(logname)
3159 def blacklist_more(filepath):
3160 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
3161 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
3163 # Create a list of (Context, result_dict) tuples. This is necessary because
3164 # the csv file may be larger than the amount of available memory.
3168 blacklist_more, item['pid'], item['trace'], item['thunk_cmd']),
3170 'output': item['output'],
3171 'trace': item['trace'],
3174 for item in read_json(logname)['traces']
3175 if not trace_name or item['trace'] == trace_name
3178 # The log may be too large to fit in memory and it is not efficient to read
3179 # it multiple times, so multiplex the contexes instead, which is slightly
3181 with open('%s.preprocessed' % logname, 'rb') as f:
3183 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
3184 for encoded in lines:
3185 line = [s.decode('utf-8') for s in encoded]
3186 # Convert the PID in-place from hex.
3187 line[cls.Context.PID] = int(line[cls.Context.PID], 16)
3188 for context in contexes:
3189 if 'exception' in context[1]:
3192 context[0].on_line(line)
3193 except TracingFailure:
3194 context[1]['exception'] = sys.exc_info()
3196 for context in contexes:
3197 if 'exception' in context[1]:
3199 context[1]['results'] = context[0].to_results()
3201 return [context[1] for context in contexes]
3205 """Returns the correct implementation for the current OS."""
3206 if sys.platform == 'cygwin':
3207 raise NotImplementedError(
3208 'Not implemented for cygwin, start the script from Win32 python')
3210 'win32': LogmanTrace,
3216 # Defaults to strace.
3217 return flavors.get(sys.platform, Strace)()
3220 def extract_directories(root_dir, files, blacklist):
3221 """Detects if all the files in a directory are in |files| and if so, replace
3222 the individual files by a Results.Directory instance.
3224 Takes a list of Results.File instances and returns a shorter list of
3225 Results.File and Results.Directory instances.
3228 - root_dir: Optional base directory that shouldn't be search further.
3229 - files: list of Results.File instances.
3230 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
3233 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
3234 assert not (root_dir or '').endswith(os.path.sep), root_dir
3235 # It is important for root_dir to not be a symlinked path, make sure to call
3236 # os.path.realpath() as needed.
3237 assert not root_dir or (
3238 os.path.realpath(get_native_path_case(root_dir)) == root_dir)
3239 assert not any(isinstance(f, Results.Directory) for f in files)
3240 # Remove non existent files.
3241 files = [f for f in files if f.existent]
3244 # All files must share the same root, which can be None.
3245 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
3247 # Creates a {directory: {filename: File}} mapping, up to root.
3250 buckets[root_dir] = {}
3251 for fileobj in files:
3252 path = fileobj.full_path
3253 directory = os.path.dirname(path)
3255 # Do not use os.path.basename() so trailing os.path.sep is kept.
3256 basename = path[len(directory)+1:]
3257 files_in_directory = buckets.setdefault(directory, {})
3258 files_in_directory[basename] = fileobj
3259 # Add all the directories recursively up to root.
3262 directory = os.path.dirname(directory)
3263 if directory + os.path.sep == root_dir or directory == old_d:
3265 buckets.setdefault(directory, {})
3267 root_prefix = len(root_dir) + 1 if root_dir else 0
3268 for directory in sorted(buckets, reverse=True):
3269 actual = set(f for f in os.listdir(directory) if not blacklist(f))
3270 expected = set(buckets[directory])
3271 if not (actual - expected):
3272 parent = os.path.dirname(directory)
3273 buckets[parent][os.path.basename(directory)] = Results.Directory(
3275 directory[root_prefix:],
3277 sum(f.size for f in buckets[directory].itervalues()),
3278 sum(f.nb_files for f in buckets[directory].itervalues()))
3279 # Remove the whole bucket.
3280 del buckets[directory]
3282 # Reverse the mapping with what remains. The original instances are returned,
3283 # so the cached meta data is kept.
3284 files = sum((x.values() for x in buckets.itervalues()), [])
3285 return sorted(files, key=lambda x: x.path)
3288 def trace(logfile, cmd, cwd, api, output):
3289 """Traces an executable. Returns (returncode, output) from api.
3292 - logfile: file to write to.
3293 - cmd: command to run.
3294 - cwd: current directory to start the process in.
3295 - api: a tracing api instance.
3296 - output: if True, returns output, otherwise prints it at the console.
3298 cmd = fix_python_path(cmd)
3299 api.clean_trace(logfile)
3300 with api.get_tracer(logfile) as tracer:
3301 return tracer.trace(cmd, cwd, 'default', output)
3305 """Cleans up traces."""
3306 parser = OptionParserTraceInputs(command='clean')
3307 options, args = parser.parse_args(args)
3309 api.clean_trace(options.log)
3314 """Traces an executable."""
3315 parser = OptionParserTraceInputs(command='trace')
3316 parser.allow_interspersed_args = False
3318 '-q', '--quiet', action='store_true',
3319 help='Redirects traced executable output to /dev/null')
3320 options, args = parser.parse_args(args)
3323 parser.error('Please provide a command to run')
3325 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3326 args[0] = os.path.abspath(args[0])
3329 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3333 """Reads the logs and prints the result."""
3334 parser = OptionParserTraceInputs(command='read')
3340 metavar='VAR_NAME directory',
3342 help=('Variables to replace relative directories against. Example: '
3343 '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
3344 'home dir with $HOME') % getpass.getuser())
3347 help='Root directory to base everything off it. Anything outside of this '
3348 'this directory will not be reported')
3351 help='Only reads one of the trace. Defaults to reading all traces')
3353 '-j', '--json', action='store_true',
3354 help='Outputs raw result data as json')
3356 '-b', '--blacklist', action='append', default=[],
3357 help='List of regexp to use as blacklist filter')
3358 options, args = parser.parse_args(args)
3360 if options.root_dir:
3361 options.root_dir = unicode(os.path.abspath(options.root_dir))
3363 variables = dict(options.variables)
3366 return any(re.match(b, f) for b in options.blacklist)
3367 data = api.parse_log(options.log, blacklist, options.trace_name)
3368 # Process each trace.
3371 if 'exception' in item:
3372 # Do not abort the other traces.
3373 print >> sys.stderr, (
3374 'Trace %s: Got an exception: %s' % (
3375 item['trace'], item['exception'][1]))
3377 results = item['results']
3378 if options.root_dir:
3379 results = results.strip_root(options.root_dir)
3382 output_as_json.append(results.flatten())
3384 simplified = extract_directories(
3385 options.root_dir, results.files, blacklist)
3386 simplified = [f.replace_variables(variables) for f in simplified]
3388 print('Trace: %s' % item['trace'])
3389 print('Total: %d' % len(results.files))
3390 print('Non existent: %d' % len(results.non_existent))
3391 for f in results.non_existent:
3392 print(' %s' % f.path)
3394 'Interesting: %d reduced to %d' % (
3395 len(results.existent), len(simplified)))
3396 for f in simplified:
3397 print(' %s' % f.path)
3400 write_json(sys.stdout, output_as_json, False)
3404 class OptionParserWithLogging(optparse.OptionParser):
3405 """Adds --verbose option."""
3406 def __init__(self, verbose=0, **kwargs):
3407 optparse.OptionParser.__init__(self, **kwargs)
3412 help='Use multiple times to increase verbosity')
3414 def parse_args(self, *args, **kwargs):
3415 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3416 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3417 logging.basicConfig(
3418 level=levels[min(len(levels)-1, options.verbose)],
3419 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3420 return options, args
3423 class OptionParserWithNiceDescription(OptionParserWithLogging):
3424 """Generates the description with the command's docstring."""
3425 def __init__(self, **kwargs):
3426 """Sets 'description' and 'usage' if not already specified."""
3427 command = kwargs.pop('command', 'help')
3430 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3431 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3432 OptionParserWithLogging.__init__(self, **kwargs)
3435 class OptionParserTraceInputs(OptionParserWithNiceDescription):
3436 """Adds automatic --log handling."""
3437 def __init__(self, **kwargs):
3438 OptionParserWithNiceDescription.__init__(self, **kwargs)
3440 '-l', '--log', help='Log file to generate or read, required')
3442 def parse_args(self, *args, **kwargs):
3443 """Makes sure the paths make sense.
3445 On Windows, / and \ are often mixed together in a path.
3447 options, args = OptionParserWithNiceDescription.parse_args(
3448 self, *args, **kwargs)
3450 self.error('Must supply a log file with -l')
3451 options.log = os.path.abspath(options.log)
3452 return options, args
3455 def extract_documentation():
3456 """Returns a dict {command: description} for each of documented command."""
3459 for fn in dir(sys.modules['__main__'])
3460 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3461 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3465 """Prints list of commands or help for a specific command."""
3466 doc = extract_documentation()
3467 # Calculates the optimal offset.
3468 offset = max(len(cmd) for cmd in doc)
3469 format_str = ' %-' + str(offset + 2) + 's %s'
3470 # Generate a one-liner documentation of each commands.
3471 commands_description = '\n'.join(
3472 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3474 parser = OptionParserWithNiceDescription(
3475 usage='%prog <command> [options]',
3476 description='Commands are:\n%s\n' % commands_description)
3477 parser.format_description = lambda _: parser.description
3479 # Strip out any -h or --help argument.
3480 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3482 if not get_command_handler(args[0]):
3483 parser.error('Unknown command %s' % args[0])
3484 # The command was "%prog help command", replaces ourself with
3485 # "%prog command --help" so help is correctly printed out.
3486 return main(args + ['--help'])
3488 parser.error('Unknown argument "%s"' % ' '.join(args))
3493 def get_command_handler(name):
3494 """Returns the command handler or CMDhelp if it doesn't exist."""
3495 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3498 def main_impl(argv):
3499 command = get_command_handler(argv[0] if argv else 'help')
3501 return CMDhelp(argv)
3502 return command(argv[1:])
3507 except TracingFailure, e:
3508 sys.stderr.write('\nError: ')
3509 sys.stderr.write(str(e))
3510 sys.stderr.write('\n')
3514 if __name__ == '__main__':
3515 sys.exit(main(sys.argv[1:]))