Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor ledd daemon & fix high CPU usage due to unexpected socket close #548

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 55 additions & 37 deletions sonic-ledd/scripts/ledd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

"""
ledd
Expand All @@ -15,7 +15,7 @@ from swsscommon import swsscommon

#============================= Constants =============================

VERSION = '1.0'
VERSION = '2.0'

SYSLOG_IDENTIFIER = "ledd"

Expand All @@ -33,10 +33,10 @@ LED_CLASS_NAME = "LedControl"
SELECT_TIMEOUT = 1000

LEDUTIL_LOAD_ERROR = 1
LEDUTIL_RUNTIME_ERROR = 2


class DaemonLedd(daemon_base.DaemonBase):

def __init__(self):
daemon_base.DaemonBase.__init__(self, SYSLOG_IDENTIFIER)

Expand All @@ -56,50 +56,67 @@ class DaemonLedd(daemon_base.DaemonBase):
namespaces = multi_asic.get_front_end_namespaces()

# Subscribe to PORT table notifications in the Application DB
appl_db = {}
self.sst = {}
self.tables = {}
self.sel = swsscommon.Select()

for namespace in namespaces:
# Open a handle to the Application database, in all namespaces
appl_db[namespace] = daemon_base.db_connect("APPL_DB", namespace=namespace)
self.sst[namespace] = swsscommon.SubscriberStateTable(appl_db[namespace], swsscommon.APP_PORT_TABLE_NAME)
self.sel.addSelectable(self.sst[namespace])

# Run daemon
def run(self):
# Use timeout to prevent ignoring the signals we want to handle
# in signal_handler() (e.g. SIGTERM for graceful shutdown)
(state, selectableObj) = self.sel.select(SELECT_TIMEOUT)

if state == swsscommon.Select.TIMEOUT:
# Do not flood log when select times out
return 1

if state != swsscommon.Select.OBJECT:
self.log_warning("sel.select() did not return swsscommon.Select.OBJECT")
return 2

self.subscribeDbTable("APPL_DB", swsscommon.APP_PORT_TABLE_NAME, namespace)

def connectDB(self, dbname, namespace):
db = daemon_base.db_connect(dbname, namespace=namespace)
return db

def subscribeDbTable(self, dbname, tblname, namespace):
db = self.connectDB(dbname, namespace)
self.tables[namespace] = swsscommon.SubscriberStateTable(db, tblname)
self.sel.addSelectable(self.tables[namespace])

def isFrontPanelPort(self, port_name):
return not port_name.startswith((backplane_prefix(), inband_prefix(), recirc_prefix()))

def updatePortLedColor(self, port_name, port_status):
self.led_control.port_link_state_change(port_name, port_status)

def getEventNamespace(self, selectObj):
# Get the corresponding namespace from redisselect db connector object
return selectObj.getDbConnector().getNamespace()

def getSelectObj(self, timeout=SELECT_TIMEOUT):
return self.sel.select(timeout)

def processPortTableEvent(self, selectableObj):
''' Process (if any) event from the PORT table in the Application DB '''
# Get the redisselect object from selectable object
redisSelectObj = swsscommon.CastSelectableToRedisSelectObj(selectableObj)
namespace = self.getEventNamespace(redisSelectObj)

# Get the corresponding namespace from redisselect db connector object
namespace = redisSelectObj.getDbConnector().getNamespace()

(key, op, fvp) = self.sst[namespace].pop()
(key, op, fvp) = self.tables[namespace].pop()
if fvp:
# TODO: Once these flag entries have been removed from the DB,
# we can remove this check
if key in ["PortConfigDone", "PortInitDone"]:
return 3
return

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we ignoring bad fvp or key not present case now?

fvp_dict = dict(fvp)

if op == "SET" and "oper_status" in fvp_dict:
if not key.startswith((backplane_prefix(), inband_prefix(), recirc_prefix())):
self.led_control.port_link_state_change(key, fvp_dict["oper_status"])
else:
return 4
if op == "SET" and "oper_status" in fvp_dict and self.isFrontPanelPort(key):
self.updatePortLedColor(key, fvp_dict["oper_status"])

# Run daemon
def run(self):
# Use timeout to prevent ignoring the signals we want to handle
# in signal_handler() (e.g. SIGTERM for graceful shutdown)
(state, selectableObj) = self.getSelectObj()

if state == swsscommon.Select.TIMEOUT:
# NOOP - Nothing to process here
return 0

if state != swsscommon.Select.OBJECT:
self.log_warning("sel.select() did not return swsscommon.Select.OBJECT - May be socket closed???")
return -1 ## Fail here so that the daemon can be restarted

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider return code = (128+non-zero-code) per all other daemons code.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@assrinivasan do you know the reason for 128+ ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding is that we use 128+[non-zero code] so that supervisor can restart the daemon. Since your comment in line 118 indicates that the daemon would need to be restarted, I suggested this change.

self.processPortTableEvent(selectableObj)

return 0

Expand All @@ -126,8 +143,9 @@ def main():

# Listen indefinitely for changes to the PORT table in the Application DB's
while True:
ledd.run()

if 0 != ledd.run():
print("ledd.run() failed... Exiting")
sys.exit(LEDUTIL_RUNTIME_ERROR)

if __name__ == '__main__':
main()
main()
8 changes: 4 additions & 4 deletions sonic-ledd/tests/test_ledd.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test_run_select_timeout(self, mock_select, mock_sst, mock_load_plat_util):

daemon_ledd = ledd.DaemonLedd()
ret = daemon_ledd.run()
assert ret == 1
assert ret == 0

@mock.patch("ledd.DaemonLedd.load_platform_util")
@mock.patch("ledd.swsscommon.SubscriberStateTable")
Expand All @@ -85,7 +85,7 @@ def test_run_bad_select_return(self, mock_select, mock_sst, mock_load_plat_util)

daemon_ledd = ledd.DaemonLedd()
ret = daemon_ledd.run()
assert ret == 2
assert ret == -1

@mock.patch("ledd.DaemonLedd.load_platform_util")
@mock.patch("ledd.swsscommon.CastSelectableToRedisSelectObj")
Expand All @@ -104,7 +104,7 @@ def test_run_ignore_keys(self, mock_select, mock_sst, mock_cstrso, mock_load_pla

daemon_ledd = ledd.DaemonLedd()
ret = daemon_ledd.run()
assert ret == 3
assert ret == 0

@mock.patch("ledd.DaemonLedd.load_platform_util")
@mock.patch("ledd.swsscommon.CastSelectableToRedisSelectObj")
Expand All @@ -123,7 +123,7 @@ def test_run_bad_fvp(self, mock_select, mock_sst, mock_cstrso, mock_load_plat_ut

daemon_ledd = ledd.DaemonLedd()
ret = daemon_ledd.run()
assert ret == 4
assert ret == 0

@mock.patch("ledd.DaemonLedd.load_platform_util")
@mock.patch("ledd.swsscommon.CastSelectableToRedisSelectObj")
Expand Down
Loading