#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
@author Eric Bullen <[email protected]>
@application jtune.py
@version 1.2
@abstract This tool will give detailed information about the running
JVM in real-time. It produces useful information that can
further assist the user in debugging and optimization.
@license Copyright 2015 LinkedIn Corp. All rights reserved.
Licensed under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
either express or implied.
"""
import atexit
import datetime
import getpass
import locale
import logging
import math
import os
import pickle
import re
import resource
import shlex
import socket
import subprocess as sp
import sys
import textwrap
import time
from decimal import Decimal
from itertools import izip_longest, izip, count
import argparse
import multiprocessing as mp
try:
locale.setlocale(locale.LC_ALL, 'en_US')
except locale.Error:
# Try UTF8 variant before failing
locale.setlocale(locale.LC_ALL, 'en_US.utf8')
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s: "%(name)s" (line: %(lineno)d) - %(levelname)s: %(message)s'))
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(handler)
# For me to use in PyCharm to read flight recorder files
DEBUG = False
class GCRecord(object):
"""Object definition for a single gc record."""
_version = "1.0"
def __init__(self, raw_gc_record=None):
if raw_gc_record:
self.raw_gc_record = raw_gc_record
else:
self.raw_gc_record = list()
self.is_cms_gc = False
self.is_stw_gc = False
self.cms_sweep_time = None
self.valid_record = False
self.record_timestamp = None
self.jvm_running_time = None
self.gc_type = None
self.desired_survivor_size = None
self.curr_threshold = None
self.max_threshold = None
self.ages = list()
self.young_size_before_gc = None
self.young_size_after_gc = None
self.young_size_total = None
self.young_gc_time = 0
self.total_heap_before_gc = None
self.total_heap_after_gc = None
self.total_heap = None
self.total_gc_time = 0
self.og_used = None
self.stw_time = 0
self._parse_record()
def __repr__(self):
"""This prints out the gc record so that it looks as though it came straight from
the logs."""
output = list()
output.append("{0} Runtime: {1} GC Type: {2}".format(self.record_timestamp, self.jvm_running_time, self.gc_type))
output.append("Desired Survivor Size: {0}, Curr Threshold: {1} (Max: {2})".format(self.desired_survivor_size, self.curr_threshold, self.max_threshold))
for age in self.ages:
if age[1] > -1 or age[2] > -1:
output.append("- Age {0}: {1:>10} bytes, {2:>10} total".format(age[0], age[1], age[2]))
output.append("YG Before GC: {0}K, YG After GC: {1}K (Total: {2}K), {3} secs".format(self.young_size_before_gc, self.young_size_after_gc, self.young_size_total, self.young_gc_time))
output.append("Total Heap Before GC: {0}K, Total Heap After GC: {1}K (Total: {2}K), {3} secs".format(self.total_heap_before_gc, self.total_heap_after_gc, self.total_heap, self.total_gc_time))
return "\n".join(output)
def _parse_record(self):
"""This loops through record_array to set the class variables that make up the record."""
self.record_timestamp, record_array = self.raw_gc_record
#############################################################
# Capture STW (Full GC, remarks, etc.). Yeah, I could combine
# these three, but this is good enough for now.
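        # Illustrative last line of such a record (made-up values; the
        # trailing ", N.NNNNNNN secs] " is what the regexes below extract):
        #   ... [1 CMS-initial-mark: 1433504K(2516608K)] 1525559K(4119040K), 0.0123456 secs] [Times: user=0.01 sys=0.00, real=0.01 secs]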
if any("CMS Initial Mark" in line for line in record_array):
match = re.search(r", ([\d\.]+) secs\] ", record_array[-1])
if match:
self.gc_type = "CMS-STW"
self.is_stw_gc = True
self.valid_record = True
self.stw_time += float(match.group(1))
if any("CMS Final Remark" in line for line in record_array):
match = re.search(r", ([\d\.]+) secs\] ", record_array[-1])
if match:
self.gc_type = "CMS-STW"
self.is_stw_gc = True
self.valid_record = True
self.stw_time += float(match.group(1))
if any("Full GC" in line for line in record_array):
match = re.search(r", ([\d\.]+) secs\] ", record_array[-1])
if match:
self.gc_type = "FULL"
self.is_stw_gc = True
self.valid_record = True
self.stw_time += float(match.group(1))
if not self.is_stw_gc:
for line in record_array:
if "CMS-concurrent-sweep: " in line:
match = re.match(r"^\d+-\d+-\d+T\d+:\d+:[\d\.]+[+-]\d+: ([\d\.]+): \[CMS-concurrent-sweep: [\d\.]+/([\d\.]+) secs", line)
if match:
self.is_cms_gc = True
self.valid_record = True
self.gc_type = "CMS"
self.jvm_running_time = float(match.group(1))
self.cms_sweep_time = float(match.group(2))
break
if not (self.jvm_running_time or self.gc_type):
match = re.match(r"^\d+-\d+-\d+T\d+:\d+:[\d\.]+[+-]\d+: ([\d\.]+): .*\[(\S+)", line)
if match:
self.jvm_running_time = float(match.group(1))
self.gc_type = match.group(2)
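                # Illustrative line matched below (made-up values):
                #   Desired survivor size 107347968 bytes, new threshold 4 (max 15)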
if not (self.desired_survivor_size or self.curr_threshold or self.max_threshold):
match = re.match(r"^Desired survivor size (\d+) bytes, new threshold (\d+) \(max (\d+)\)", line)
if match:
self.valid_record = True
self.desired_survivor_size = int(match.group(1))
self.curr_threshold = int(match.group(2))
self.max_threshold = int(match.group(3))
# Here I set the survivor size beforehand, for any that
# may be missing as I want all the ages even if they aren't
# being used for comparison between GCs
for age in range(1, self.max_threshold + 1):
self.ages.append((age, -1, -1))
continue
################################################
# Skipping records when the JVM has been running
# for less than 300 seconds
if self.jvm_running_time < 300:
self.valid_record = False
break
#############################
# Capture survivor ages, etc.
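            # Illustrative matching line (made-up values):
            #   - age   1:   10532656 bytes,   10532656 total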
match = re.match(r"^- age\s+(\d+):\s+(\d+) bytes,\s+(\d+) total", line)
if match:
                ############################################################
                # Record the sizes for this age. Any ages that were
                # fully reaped keep the -1 placeholders set above. This is
                # important as the analytics needs to know this to determine
                # survivor death rates/ratios
age = int(match.group(1))
curr_size = int(match.group(2))
max_size = int(match.group(3))
self.ages[age - 1] = (age, curr_size, max_size)
continue
###############################
# Capture gc reallocation stats
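            # Illustrative matching line (made-up values):
            #   : 262144K->8192K(294912K), 0.0213130 secs] 524288K->270336K(1048576K), 0.0214520 secs]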
match = re.match(r"^: (\d+)\w->(\d+)\w\((\d+)\w\), ([\d\.]+) secs\] (\d+)\w->(\d+)\w\((\d+)\w\), ([\d\.]+) secs\]", line)
if match:
self.young_size_before_gc = int(match.group(1)) * 1024
self.young_size_after_gc = int(match.group(2)) * 1024
self.young_size_total = int(match.group(3)) * 1024
self.young_gc_time = Decimal(match.group(4))
self.total_heap_before_gc = int(match.group(5)) * 1024
self.total_heap_after_gc = int(match.group(6)) * 1024
self.total_heap = int(match.group(7)) * 1024
self.total_gc_time = Decimal(match.group(8))
self.og_used = self.total_heap_after_gc - self.young_size_after_gc
def display(message=None, keep_newline=True, save_output=True):
"""Basically wraps the print function so that it will also save the output to an array for pasting
Keyword arguments:
message -- the message to print
keep_newline -- if this is True, then print it, otherwise, print with no newline (like print with a comma at the end)
save_output -- if this is false, do not save the output to an array for pasting
"""
    # The 'global' isn't strictly needed (the list is only mutated, never
    # rebound), but better to be explicit than not
global display_output
if save_output:
display_output.append(message)
if message.endswith("\n"):
message = message[:-1]
if keep_newline:
print message
else:
print message,
def liverun(cmd=None):
"""Run cmd, and return an iterator of said cmd.
Keyword arguments:
cmd -- the command to run
"""
global subproc
env = dict(os.environ)
# Combining stdout and stderr. I can't find a way to keep both separate
# while getting the data 'live'. itertools.izip_longest seemed like it'd
# almost do it, but it caches the results before sending it out...
subproc = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.STDOUT, env=env)
return iter(subproc.stdout.readline, b'')
def reduce_seconds(secs=None):
"""Return a compressed representation of time in seconds
Keyword arguments:
secs -- a float/int representing the seconds to be 'compressed'
"""
    # The nested if statements keep the result from being too long
    # by lopping off the non-significant values
retval = ""
secs = int(float(secs))
mins, secs = divmod(secs, 60)
hours, mins = divmod(mins, 60)
days, hours = divmod(hours, 24)
secs = int("{0:0.0f}".format(secs))
if days:
retval += "{0}d".format(days)
if hours:
retval += "{0}h".format(hours)
if days > 0:
return retval
if mins:
retval += "{0}m".format(mins)
if hours or days:
return retval
if secs:
retval += "{0:}s".format(secs)
return retval
def sec_diff(first_time=None, second_time=None):
"""Return the number of seconds between two datetime objects
Keyword arguments:
first_time -- The (typically) older time of the two
second_time -- The (typically) newer time of the two
"""
time_delta = second_time - first_time
return time_delta.days * 86400 + time_delta.seconds + Decimal(str(time_delta.microseconds / float(1000000)))
def _min(values=None):
"""A wrapper around the min() function so that it does not error on an empty list"""
try:
return min(values)
except ValueError:
return 0
def _max(values=None):
"""A wrapper around the max() function so that it does not error on an empty list"""
try:
return max(values)
except ValueError:
return 0
def median(values=None):
"""Return the median of 'values'
Keyword arguments:
values -- the list of numbers
"""
sorts = sorted(values)
length = len(sorts)
result = None
if not values:
result = 0
# raise ValueError, "I can't find the median of an empty list."
elif not length % 2:
result = (sorts[(length / 2)] + sorts[(length / 2) - 1]) / 2.0
else:
result = sorts[length / 2]
return result
def mean(values=None, _length=None):
"""Return the mean of 'values'
Keyword arguments:
values -- the list of numbers
_length -- mostly not usable for end-users, needed by the stdev function
"""
result = None
if not _length:
_length = len(values)
if _length > 0:
result = Decimal(str(sum(values))) / _length
else:
result = 0
return result
def stdev(values=None):
"""Return the standard deviation of values
Keyword arguments:
values -- The poorly named argument that contains the list of numbers
"""
values_mean = mean(values)
variance = map(lambda x: math.pow(Decimal(str(x)) - values_mean, 2), values)
return math.sqrt(mean(variance, len(variance) - 1))
def percentile(values=None, pct=None):
"""Return the percentile of a given values
Keyword arguments:
values -- The list of numbers to be analyzed
pct -- The percentile (can be a float) to be used (100 == 100%, not 1 = 100%, etc.)
"""
watermark_index = int(round((float(pct) / 100) * len(values) + .5))
watermark = sorted(values)[watermark_index - 1]
return [element for element in values if element <= watermark]
def ord_num(number=None):
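    """Return the number with its English ordinal suffix,
    e.g. 1 -> '1st', 2 -> '2nd', 11 -> '11th', 23 -> '23rd'."""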
return str(number) + ("th" if 4 <= number % 100 <= 20 else {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th"))
def reduce_k(size=None, precision=2, short_form=True, _place_holder=0):
"""Return a compressed representation of a given number of bytes
Keyword arguments:
size -- the size in bytes
precision -- what precision should be used (places to the right of the decimal)
short_form -- (true/false). Use 'K' instead of 'KiB', etc.
"""
if not isinstance(size, Decimal):
size = Decimal(str(size))
# You know.. just in case we ever get to a yottabyte
if short_form:
iec_scale = ['K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']
else:
iec_scale = ['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
if abs(size) >= 1024:
_place_holder += 1
return reduce_k(size / Decimal("1024.0"), precision=precision, short_form=short_form, _place_holder=_place_holder)
else:
value = Decimal("{0:.{1}f}".format(size, precision))
if Decimal(str(int(value))) == value:
value = int(value)
if short_form:
return "{0}{1}".format(value, iec_scale[_place_holder])
else:
return "{0} {1}".format(value, iec_scale[_place_holder])
def _run_analysis(gc_data=None, jmap_data=None, jstat_data=None, proc_details=None, replay_file=None, optimized_for_ygcs_rate=None):
"""The meat-and-potatoes of this tool. This takes in numerous data structures,
and prints out a report of the analysis of them."""
# Formulas to get the JVM configuration just from JMap:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# eden_size == (newsize * survivor_ratio)/(2 + survivor_ratio)
# survivor_size == eden_size * (1/survivor_ratio)
# og_size == max_heap_size - eden_size - survivor_size
# og_used == heap_used - eden_used
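    # Illustrative check of those formulas (made-up numbers): with
    # NewSize = 1000M and SurvivorRatio = 8:
    #   eden_size     = (1000 * 8) / (2 + 8) = 800M
    #   survivor_size = 800 * (1.0 / 8)      = 100M
    # and eden + 2 survivor spaces == 1000M, the original NewSize.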
if not gc_data:
logger.error("I can't do any analysis for this sample period because there wasn't enough data in the GC log. Exiting.")
sys.exit(1)
############################################################
# Get some summary data that doesn't require GC log analysis
textwrap_offset = 80
# Loop through the GC data array to find all CMS events, and capture
# how long they took.
cms_times = [record.cms_sweep_time for record in gc_data if record.is_cms_gc]
display("\n")
display("Meta:\n")
display("~~~~~\n")
sample_time_secs = sec_diff(gc_data[0].record_timestamp, gc_data[-1].record_timestamp)
if sample_time_secs < 60:
display("GC Sample Time: {0} seconds\n".format(sample_time_secs))
else:
display("GC Sample Time: {0} ({1} seconds)\n".format(reduce_seconds(sample_time_secs), sample_time_secs))
display("GC Sample Time from {0} to {1}\n".format(gc_data[0].record_timestamp, gc_data[-1].record_timestamp))
if proc_details:
cpu_count = mp.cpu_count()
cpu_uptime = cpu_count * proc_details['sys_uptime_seconds']
proc_utime_pct = proc_details['proc_utime_seconds'] / cpu_uptime
proc_stime_pct = proc_details['proc_stime_seconds'] / cpu_uptime
display("System Uptime: {0}\n".format(reduce_seconds(proc_details['sys_uptime_seconds'])))
display("Proc Uptime: {0}\n".format(reduce_seconds(proc_details['proc_uptime_seconds'])))
display("Proc Usertime: {0} ({1:0.2%})\n".format(reduce_seconds(proc_details['proc_utime_seconds']), proc_utime_pct))
display("Proc Systime: {0} ({1:0.2%})\n".format(reduce_seconds(proc_details['proc_stime_seconds']), proc_stime_pct))
display("Proc RSS: {0}\n".format(reduce_k(proc_details['proc_rss_bytes'] / 1024)))
display("Proc VSize: {0}\n".format(reduce_k(proc_details['proc_vsize_bytes'] / 1024)))
display("Proc # Threads: {0}\n".format(proc_details['num_threads']))
display("\n")
# Exit out as I don't have enough gc_data to do any analysis on
if len(gc_data) < 2:
display("\n")
display("* NOTE: There wasn't enough data to do any analysis. Please let the tool\n")
display(" gather at least 2 complete gc.log records (found {0}).\n".format(len(gc_data)))
return False
survivor_info = dict()
young_gc_count_delta = len([record.is_stw_gc for record in gc_data if not record.is_stw_gc])
full_gc_count_delta = len([record.is_stw_gc for record in gc_data if record.is_stw_gc])
jvm_uptime = gc_data[-1].jvm_running_time
sample_gc_time = sum(record.total_gc_time for record in gc_data)
sample_gc_load = (sample_gc_time / Decimal(str(sample_time_secs))) * 100
#######################################################
# Get young gen allocation rates over the sample period
yg_rates = list()
for first_gc, second_gc in zip(gc_data, gc_data[1:]):
if any([second_gc.is_stw_gc, first_gc.is_stw_gc, first_gc.is_cms_gc, second_gc.is_cms_gc]):
continue
# Iterate over the gc logs 2 at a time
# [1, 2, 3, 4] ->
# [(1, 2), (2, 3), (3, 4)]
#
time_delta = sec_diff(first_gc.record_timestamp, second_gc.record_timestamp)
try:
yg_size_delta = second_gc.young_size_before_gc - first_gc.young_size_after_gc
yg_growth_delta = second_gc.young_size_after_gc - first_gc.young_size_after_gc
except TypeError:
display("\n".join(textwrap.wrap("Warning: Something is really wrong with this JVM; I couldn't get correct GC data for it.", textwrap_offset)))
display("")
yg_size_delta = 0
yg_growth_delta = 0
# These are in KiB/s
yg_alloc_rate = yg_size_delta / time_delta
yg_growth_rate = yg_growth_delta / time_delta
yg_rates.append((yg_alloc_rate, yg_growth_rate))
#####################################################
# Get old gen promotion rates over the sample period
og_rates = list()
for first_gc, second_gc in zip(gc_data, gc_data[1:]):
if any([second_gc.is_stw_gc, first_gc.is_stw_gc, first_gc.is_cms_gc, second_gc.is_cms_gc]):
continue
time_delta = sec_diff(first_gc.record_timestamp, second_gc.record_timestamp)
# These are in KiB/s
og_allocation_delta = (second_gc.og_used - first_gc.og_used) / Decimal("1024")
og_allocation_rate = og_allocation_delta / time_delta
############################################################################
        # I only want samples where the old gen is growing. If it's decreasing, it's probably
# b/c there was a FGC, and space is being reclaimed.
if og_allocation_delta > 0:
# This is in KiB/s
og_rates.append(og_allocation_rate)
############################
# Calc survivor death ratios
gc_survivor_death_rates = list()
for first_gc, second_gc in zip(gc_data, gc_data[1:]):
if any([second_gc.is_stw_gc, first_gc.is_stw_gc, first_gc.is_cms_gc, second_gc.is_cms_gc]):
continue
survivor_death_rates = list()
for first_age, second_age in zip(first_gc.ages, second_gc.ages[1:]):
# The second age CAN be bigger than the first age. I verified
# this in the gc.logs (still not sure how/why)
# ID 0 is the age number
# ID 1 is bytes in that age
# ID 2 is the total bytes for that age
if second_age[1] == -1:
# I don't think I want to capture any changes if
# the survivor space didn't exist (-1 as a default value- see above)
continue
# survivor_death_rates.append(Decimal(0))
else:
survivor_death_rates.append(1 - (Decimal(second_age[1]) / first_age[1]))
gc_survivor_death_rates.append(survivor_death_rates)
#################################################################################
# Since I have 2 in-scope valid GCs, I'm going to calculate some needed JVM sizes
# the sizes will be fixed if I have a fixed heap size (which we do in prod)
jvm_mem_cfg = dict()
try:
jvm_mem_cfg["og_size"] = (first_gc.total_heap - first_gc.young_size_total) * 1024
except TypeError:
display("\n".join(textwrap.wrap("Error: I could not find a non CMS/FGC GC record for analysis. Exiting.", textwrap_offset)))
display("")
sys.exit(1)
jvm_mem_cfg["survivor_size"] = (first_gc.desired_survivor_size * 2)
jvm_mem_cfg["eden_size"] = (first_gc.young_size_total * 1024) - jvm_mem_cfg["survivor_size"]
jvm_mem_cfg["total_heap"] = (first_gc.total_heap * 1024) + jvm_mem_cfg["survivor_size"]
jvm_mem_cfg["new_size"] = (jvm_mem_cfg["eden_size"] + (jvm_mem_cfg["survivor_size"] * 2))
#########################################################
# Now that I have a crap-ton of curated data, report out.
# This grabs the first part of the tuple (which is
# the total allocation for that gc (not growth!)
yg_alloc_rates = [entry[0] for entry in yg_rates]
min_yg_rate, mean_yg_rate, max_yg_rate = _min(yg_alloc_rates), mean(yg_alloc_rates), _max(yg_alloc_rates)
display("YG Allocation Rates*:\n")
display("~~~~~~~~~~~~~~~~~~~~~\n")
display("per sec (min/mean/max): {0:>13} {1:>13} {2:>13}\n".format(reduce_k(min_yg_rate) + "/s", reduce_k(mean_yg_rate) + "/s", reduce_k(max_yg_rate) + "/s"))
display("per hr (min/mean/max): {0:>13} {1:>13} {2:>13}\n".format(reduce_k(min_yg_rate * 3600) + "/h", reduce_k(mean_yg_rate * 3600) + "/h", reduce_k(max_yg_rate * 3600) + "/h"))
display("\n")
# This grabs the second part of the tuple (which is
# the total growth for that gc (not allocation rate!)
min_og_rate, mean_og_rate, max_og_rate = _min(og_rates), mean(og_rates), _max(og_rates)
display("OG Promotion Rates:\n")
display("~~~~~~~~~~~~~~~~~~~\n")
display("per sec (min/mean/max): {0:>13} {1:>13} {2:>13}\n".format(reduce_k(min_og_rate) + "/s", reduce_k(mean_og_rate) + "/s", reduce_k(max_og_rate) + "/s"))
display("per hr (min/mean/max): {0:>13} {1:>13} {2:>13}\n".format(reduce_k(min_og_rate * 3600) + "/h", reduce_k(mean_og_rate * 3600) + "/h", reduce_k(max_og_rate * 3600) + "/h"))
display("\n")
################################################
# Survivor Lengths- wanted to make a nested list
# comprehension, but I suppose that's a bit ugly
# to debug/read
display("Survivor Death Rates:\n")
display("~~~~~~~~~~~~~~~~~~~~~\n")
survivor_lengths = list()
for sub_arr in gc_survivor_death_rates:
survivor_lengths.append(len([elem for elem in sub_arr if elem > 0]))
display("Lengths (min/mean/max): {0}/{1:0.1f}/{2}\n".format(_min(survivor_lengths), mean(survivor_lengths), _max(survivor_lengths)))
display("Death Rate Breakdown:\n")
cuml_pct = 1
death_ages = list()
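    # Illustrative: with mean death rates of 60% (age 1) and 50% (age 2), the
    # cumulative "alive" column below reads 40% after age 1, then 20%
    # (0.4 * 0.5) after age 2.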
for survivor_num, pct_list in enumerate(izip_longest(*gc_survivor_death_rates, fillvalue=0), 1):
min_pct = min(pct_list)
mean_pct = mean(pct_list)
max_pct = max(pct_list)
cuml_pct *= 1 - mean_pct
death_ages.append(mean_pct)
survivor_info[survivor_num] = min_pct, mean_pct, max_pct
display(" Age {0}: {1:>5} / {2:>5} / {3:>5} / {4:>5} (min/mean/max/cuml alive %)\n".format(survivor_num, "{0:0.1%}".format(min_pct), "{0:0.1%}".format(mean_pct), "{0:0.1%}".format(max_pct),
"{0:0.1%}".format(cuml_pct)))
##################################
# GC Times
young_gc_times = [record.young_gc_time * 1000 for record in gc_data if not record.is_stw_gc]
full_gc_times = [record.stw_time * 1000 for record in gc_data if record.is_stw_gc]
if sample_time_secs:
if young_gc_count_delta:
ygc_rate = (young_gc_count_delta / sample_time_secs) * 60
else:
ygc_rate = 0
if full_gc_count_delta:
fgc_rate = (full_gc_count_delta / sample_time_secs) * 60
else:
fgc_rate = 0
display("\n")
display("GC Information:\n")
display("~~~~~~~~~~~~~~~\n")
display("YGC/FGC Count: {0}/{1} (Rate: {2:0.2f}/min, {3:0.2f}/min)\n".format(young_gc_count_delta, full_gc_count_delta, ygc_rate, fgc_rate))
display("\n")
display("Sample Period GC Load: {0:0.2f}%\n".format(sample_gc_load))
display("")
display("CMS Sweep Times: {0:0.3f}s / {1:0.3f}s / {2:0.3f}s / {3:0.2f} (min/mean/max/stdev)\n".format(_min(cms_times), mean(cms_times), _max(cms_times), stdev(cms_times)))
display("YGC Times: {0:0.0f}ms / {1:0.0f}ms / {2:0.0f}ms / {3:0.2f} (min/mean/max/stdev)\n".format(_min(young_gc_times), mean(young_gc_times), _max(young_gc_times), stdev(young_gc_times)))
display("FGC Times: {0:0.0f}ms / {1:0.0f}ms / {2:0.0f}ms / {3:0.2f} (min/mean/max/stdev)\n".format(_min(full_gc_times), mean(full_gc_times), _max(full_gc_times), stdev(full_gc_times)))
agg_ygc_time = sum(young_gc_times)
agg_fgc_time = sum(full_gc_times)
display("Agg. YGC Time: {0:0.0f}ms\n".format(agg_ygc_time))
display("Agg. FGC Time: {0:0.0f}ms\n".format(agg_fgc_time))
display("\n")
if og_rates:
display(
"Est. Time Between FGCs (min/mean/max): {0:>10} {1:>10} {2:>10}\n".format(reduce_seconds(jvm_mem_cfg["og_size"] / min_og_rate), reduce_seconds(jvm_mem_cfg["og_size"] / mean_og_rate),
reduce_seconds(jvm_mem_cfg["og_size"] / max_og_rate)))
else:
display("Est. Time Between FGCs (min/mean/max): {0:>10} {1:>10} {2:>10}\n".format("n/a", "n/a", "n/a"))
display("Est. OG Size for 1 FGC/hr (min/mean/max): {0:>10} {1:>10} {2:>10}\n".format(reduce_k(min_og_rate * 3600), reduce_k(mean_og_rate * 3600), reduce_k(max_og_rate * 3600)))
display("\n")
display("Overall JVM Efficiency Score*: {0:0.3f}%\n".format(100 - sample_gc_load))
display("\n")
###################################
# JMap Data
display("Current JVM Mem Configuration:\n")
display("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
if jmap_data:
for k, v in jmap_data.iteritems():
if "Size" in k:
v = reduce_k(v / 1024)
display("{0:>17}: {1}\n".format(k, v))
else:
for k, v in jvm_mem_cfg.iteritems():
display("{0:>17}: {1}\n".format(k, reduce_k(v / 1024)))
display("\n")
######################
# Show recommendations
_show_recommendations(death_ages, young_gc_times, full_gc_times, fgc_rate, ygc_rate, yg_alloc_rates, og_rates, jvm_mem_cfg, jmap_data, jstat_data, gc_data, cms_times, survivor_info,
optimized_for_ygcs_rate, proc_details)
display("~~~\n")
display("* The allocation rate is the increase in usage before a GC done. Growth rate\n")
display(" is the increase in usage after a GC is done.\n")
display("\n")
display("* The JVM efficiency score is a convenient way to quantify how efficient the\n")
display(" JVM is. The most efficient JVM is 100% (pretty much impossible to obtain).\n")
if full_gc_count_delta == 0:
display("\n")
display("* There were no full GCs during this sample period. This reporting will\n")
display(" be less useful/accurate as a result.\n")
display("\n")
display("* A copy of the critical data used to generate this report is stored\n")
display(" in /tmp/jtune_data-{0}.bin.bz2. Please copy this to your homedir if you\n".format(user))
display(" want to save/analyze this further.\n")
def _get_survivor_info(death_ages=None, survivor_info=None, gc_data=None, survivor_problem_pct=None, curr_ng_size=None, adj_ng_size=None):
"""This looks at the survivor info data structure, and will return the max
tenuring size, and max tenuring age that it feels is needed."""
# This is roughly how much larger the survivor space should be to counteract the increase
    # in the frequency of ygcs caused by the smaller NG size as it pushes data into the
# survivor space more often. I don't need to change the MaxTenuringThreshold as that is
# mostly constant depending on how data ages.
#
# I'm adjusting the size of the survivor space based on the eden change. It MAY be better
# adjusting this based on time of how frequent the ygcs are happening.
ng_size_delta = curr_ng_size - adj_ng_size
    # Going to use this to change the MaxTenuringThreshold parameter. The reason is that
# ygcs will happen less/more often if I change the ng size, and I'll need to counter
# that by increasing/decreasing the tenuring threshold to keep things in balance.
    # Decimal avoids Python 2 integer division truncating this ratio to 0
    ng_size_delta_pct = Decimal(str(adj_ng_size)) / Decimal(str(curr_ng_size))
# Changing the 'survivor_problem_pct' which is the watermark
# for objects still alive. If it's over that amount, then the
# tenuring threshold needs to be increased, if it's less, then
    # the age is good. HOWEVER, I use death rate, so an 85% death
# rate is a 15% survivor rate.
survivor_watermark = 100 - survivor_problem_pct
# Get the max survivor age allowed per the jvm configuration
max_survivor_age = gc_data[0].max_threshold
# The survivor_info structure is the decrease in size for that
# age going into the next, so if the max here is 6, the actual max
# survivor size used is 7.
longest_used_ratio = len(survivor_info) + 1
# Survivor percentage of surviving objects
age_objects_still_alive = list()
current_percentage = 100
for key in sorted(survivor_info):
# [1] is the average, [2] is the max
mean_death_rate_pct = survivor_info[key][1]
current_percentage *= 1 - mean_death_rate_pct
age_objects_still_alive.append(current_percentage)
error_msg = None
    if not survivor_info:
        # Check this first - it also guards the age_objects_still_alive[-1]
        # lookup below against an IndexError when the list is empty
        error_msg = "For the examined sample period, I could not retrieve any meaningful survivor statistics from the gc.log. This JVM is either sick, or the sample period was too short."
    elif max_survivor_age < 15 and longest_used_ratio == max_survivor_age:
        # age_objects_still_alive holds percentages (0-100), so compare it
        # against the percent threshold directly
        if age_objects_still_alive[-1] > (100 - survivor_watermark):
            error_msg = "The survivor ratio of {0} is too small as {1:0.1f}% of the objects are still alive. Try increasing the MaxTenuringThreshold (Max: 15) parameter, and running this analysis again.".format(
                longest_used_ratio, age_objects_still_alive[-1])
if error_msg:
raise ValueError(error_msg)
###########################################################
# Don't confuse the 'min()' with the 'max' variable. I want
# the first age where it's less than survivor_problem_pct
try:
max_tenuring_age = min([k for k, v in enumerate(age_objects_still_alive, 1) if v <= survivor_problem_pct])
except ValueError:
max_tenuring_age = 0
if not max_tenuring_age:
# Not sure if I like this algorithm, but it seems close enough
below_threshold_ct = len([death_pct for death_pct in death_ages if death_pct <= Decimal(".04")])
below_threshold_pct = below_threshold_ct / float(len(death_ages))
# If more than 33% of the ages are at or below 4%, make a note of it.
if below_threshold_pct > .33:
# It's speculative that I should add to the heap any objects that aren't reaped
            # after cutting off the MaxTenuringThreshold, but since it's not getting reaped anyway,
# it may not change anything, so not adjusting for the time being.
# We're using all the available ages, but objects are still alive...
if max_survivor_age == len(death_ages):
display("\n".join(textwrap.wrap(
"* Warning: It looks like your tenuring threshold is too high - {0:0.0%} of your ages are reaping at or below 4% of the objects. We could make it easier for the JVM if we reduce your MaxTenuringThreshold by {1} to {2} instead of {3}.".format(
below_threshold_pct, below_threshold_ct, len(death_ages) - below_threshold_ct, max_survivor_age))))
else:
display("\n".join(textwrap.wrap(
"* Warning: It looks like your tenuring threshold is too high - {0:0.0%} of your ages are reaping at or below 4% of the objects. We could make it easier for the JVM if we reduce your MaxTenuringThreshold by {1} to {2} instead of {3}. BE CAREFUL - your max *used* age in the gc.logs of {4} is less than the configured max age of {3} - make sure that you used a large enough sample size, and let the JVM go through 3 FGCs (option: '-s 3') and is being checked during peak traffic.".format(
below_threshold_pct, below_threshold_ct, len(death_ages) - below_threshold_ct, max_survivor_age, len(death_ages)))))
max_tenuring_age = len(death_ages) - below_threshold_ct
else:
display("\n".join(textwrap.wrap(
"* Warning: Your survivor age is too short, your last age of {0} has {1:0.2f}% of its objects still alive. Because of this, I'm unable to reliably determine how your objects are aging. Unset or increase the MaxTenuringThreshold (max: 15) to mitigate this problem.".format(
len(age_objects_still_alive), age_objects_still_alive[-1]))))
tenure_sizes = list()
for gc_record in gc_data:
try:
tenure_sizes.append(gc_record.ages[max_tenuring_age - 1][2])
except IndexError:
# I saw a gc record that doesn't have that age
# level, so skip it.
pass
# It's recommended to have the tenuring size 2x the max tenure size, I then
# add in the change in newgen (ng_size_delta) to offset the decrease/increase
# in newgen as calculated in this parent's function. The 'ng_size_delta / 2' is
# such that I increase the whole max_tenuring_size by ng_size_delta, but since
# there are two survivor spaces, I need to split the ng_size_delta by 2 for each
# survivor space
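    # Illustrative (made-up numbers): with max(tenure_sizes) = 64M and
    # ng_size_delta = 32M, max_tenuring_size = (64 * 2) + (32 / 2) = 144M.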
max_tenuring_size = (max(tenure_sizes) * 2) + (ng_size_delta / 2)
survivor_ratio = adj_ng_size / max_tenuring_size
# Checking if survivor space is LARGER than the newgen size
if survivor_ratio < 1:
display("\n".join(textwrap.wrap(
"* Warning: The calculated recommended survivor ratio of {0:0.2f} is less than 1. This is not possible, so I increased the size of newgen by {1}, and set the survivor ratio to 1. Try the tuning suggestions, and watch closely.\n".format(
survivor_ratio, reduce_k((max_tenuring_size - adj_ng_size) / 1024)), textwrap_offset)) + "\n\n")
# This is close, but still wrong. If I run into this condition, then I need to
# also fix the newgen size b/c the tenured size is based off of the newgen
# size before I knew there was an issue. I think this is probably close enough
# for now.
survivor_ratio = 1
adj_ng_size = max_tenuring_size
else:
adj_ng_size += max_tenuring_size
# Now, change the max tenuring age/threshold
max_tenuring_age *= (1 / ng_size_delta_pct)
return adj_ng_size, survivor_ratio, max_tenuring_size, max_tenuring_age
def _show_recommendations(death_ages=None, young_gc_times=None, full_gc_times=None, fgc_rate=None, ygc_rate=None, yg_alloc_rates=None, og_rates=None, jvm_mem_cfg=None, jmap_data=None, jstat_data=None,
gc_data=None, cms_times=None, survivor_info=None, optimized_for_ygcs_rate=None, proc_details=None):
"""This is where any jvm tuning recommendations happens."""
###########################################################################
# The basis of these recommendations are as follows:
#
# 1) More frequent YGCs which take less time is almost always better
# than less frequent YGCs, but taking longer; consistently slow is
# better than periodically slower
# 2) YGC times should have a low standard deviation(<= 5)
# 3) YGC times should be low (<= 50ms, ideally)
display("Recommendation Summary:\n")
display("~~~~~~~~~~~~~~~~~~~~~~~\n")
    # Goals: mean YGC times at or below 50ms, with a standard deviation
    # at or below 5 across the sample
ygc_time_goal_ms = 50
ygc_stdev_goal = 5
# YGC mean ms percentile - lop off the worst offenders
    # Instead of taking the mean of the 99th percentile, this takes the
    # max of the 75th percentile; that may be better
ygc_pctile = 75
# This is just for analysis purposes; need a decent sample set count
ygc_count_goal = 10
fgc_count_goal = 3
# Marker for indicating if current config is good for
# the Java G1 garbage collector
ready_for_g1 = False
survivor_problem_pct = 10
ygc_stdev = stdev(percentile(young_gc_times, ygc_pctile))
ygc_mean_ms = float(max(percentile(young_gc_times, ygc_pctile)))
if jmap_data:
curr_ng_size = jmap_data['NewSize']
curr_og_size = jmap_data['OldSize']
        # Not using b/c this data is not in the GC logs (and
        # really doesn't need to be tuned...)
# if "PermSize" in jmap_data:
# curr_pg_ms_size = jmap_data['PermSize']
# else:
# curr_pg_ms_size = jmap_data['MetaspaceSize']
max_heap_size = jmap_data['MaxHeapSize']
else:
curr_ng_size = jvm_mem_cfg["new_size"]
curr_og_size = jvm_mem_cfg["og_size"]
max_heap_size = jvm_mem_cfg["total_heap"]
adj_ng_size = curr_ng_size
#########################################################################################################
# This is an estimate. Because we use CMS for FGCs, it's an iterative process, and while the CMS reset is
# happening, more objects are being tenured into OG. The best we can do (I think) is to find the minimum
# size of OU, and go from there. This is why it's super important to have more than 2 FGCs to look at.
#
# This is tricky. I need to find the first record where the previous og size is bigger than
# the current. This identifies when the first CMS runs, and from there, I can find the minimum
normal_gc_data = filter(lambda x: x.og_used > 0, gc_data)
try:
record_num = [record_num for record_num, first_gc, second_gc in izip(count(), normal_gc_data, normal_gc_data[1:]) if first_gc.og_used > second_gc.og_used][0]
except IndexError:
live_data_size_bytes = None
else:
live_data_size_bytes = _min(record.og_used for record in normal_gc_data[record_num:])
if proc_details and proc_details['proc_uptime_seconds'] < 300:
display("\n".join(textwrap.wrap(
"Warning: The process I'm doing the analysis on has been up for {0}, and may not be in a steady-state. It's best to let it be up for more than 5 minutes to get more realistic results.\n".format(
reduce_seconds(proc_details['proc_uptime_seconds'])))) + "\n\n")
#################################################
# Find the recommended NewGen size
if len(young_gc_times) < ygc_count_goal:
display("\n".join(
textwrap.wrap("Warning: There were only {0} YGC entries to do the analysis on. It's better to have > {1} to get more realistic results.\n".format(len(young_gc_times), ygc_count_goal),
textwrap_offset)) + "\n\n")
if ygc_stdev > ygc_stdev_goal * 4:
comment = "VERY inconsistent"
elif ygc_stdev > ygc_stdev_goal * 2:
comment = "pretty inconsistent"
elif ygc_stdev > ygc_stdev_goal:
comment = "somewhat consistent"
ready_for_g1 = True
else:
comment = "very consistent"
ready_for_g1 = True
messages = list()
    # This logic block goes through different optimization scenarios that it
# uses to find an optimal setting.
# messages.append("- The mean YGC rate is {0:0.2f}/min, and the max {1} percentile YGC time is {2:0.0f}ms (stdev of {3:0.2f} which is {4}). It's best to have the mean YGC time be at or below {5}ms, and the YGC stdev at or below {6} if possible.".format(ygc_rate, ord_num(ygc_pctile), ygc_mean_ms, ygc_stdev, comment, ygc_time_goal_ms, ygc_stdev_goal))
# TODO: Too much repetition in this code block
if (optimized_for_ygcs_rate > ygc_rate) and (ygc_stdev > ygc_stdev_goal or ygc_mean_ms > ygc_time_goal_ms):
adj_ng_size = curr_ng_size * (ygc_rate / optimized_for_ygcs_rate)
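        # Illustrative: with 3 observed YGCs/min against a target of 6/min,
        # newgen is scaled by 3/6 (halved) so collections happen roughly
        # twice as often.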